Path: blob/main/vendor/golang.org/x/text/runes/runes.go
2880 views
// Copyright 2014 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34// Package runes provide transforms for UTF-8 encoded text.5package runes // import "golang.org/x/text/runes"67import (8"unicode"9"unicode/utf8"1011"golang.org/x/text/transform"12)1314// A Set is a collection of runes.15type Set interface {16// Contains returns true if r is contained in the set.17Contains(r rune) bool18}1920type setFunc func(rune) bool2122func (s setFunc) Contains(r rune) bool {23return s(r)24}2526// Note: using funcs here instead of wrapping types result in cleaner27// documentation and a smaller API.2829// In creates a Set with a Contains method that returns true for all runes in30// the given RangeTable.31func In(rt *unicode.RangeTable) Set {32return setFunc(func(r rune) bool { return unicode.Is(rt, r) })33}3435// NotIn creates a Set with a Contains method that returns true for all runes not36// in the given RangeTable.37func NotIn(rt *unicode.RangeTable) Set {38return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })39}4041// Predicate creates a Set with a Contains method that returns f(r).42func Predicate(f func(rune) bool) Set {43return setFunc(f)44}4546// Transformer implements the transform.Transformer interface.47type Transformer struct {48t transform.SpanningTransformer49}5051func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {52return t.t.Transform(dst, src, atEOF)53}5455func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {56return t.t.Span(b, atEOF)57}5859func (t Transformer) Reset() { t.t.Reset() }6061// Bytes returns a new byte slice with the result of converting b using t. It62// calls Reset on t. It returns nil if any error was found. This can only happen63// if an error-producing Transformer is passed to If.64func (t Transformer) Bytes(b []byte) []byte {65b, _, err := transform.Bytes(t, b)66if err != nil {67return nil68}69return b70}7172// String returns a string with the result of converting s using t. It calls73// Reset on t. It returns the empty string if any error was found. This can only74// happen if an error-producing Transformer is passed to If.75func (t Transformer) String(s string) string {76s, _, err := transform.String(t, s)77if err != nil {78return ""79}80return s81}8283// TODO:84// - Copy: copying strings and bytes in whole-rune units.85// - Validation (maybe)86// - Well-formed-ness (maybe)8788const runeErrorString = string(utf8.RuneError)8990// Remove returns a Transformer that removes runes r for which s.Contains(r).91// Illegal input bytes are replaced by RuneError before being passed to f.92func Remove(s Set) Transformer {93if f, ok := s.(setFunc); ok {94// This little trick cuts the running time of BenchmarkRemove for sets95// created by Predicate roughly in half.96// TODO: special-case RangeTables as well.97return Transformer{remove(f)}98}99return Transformer{remove(s.Contains)}100}101102// TODO: remove transform.RemoveFunc.103104type remove func(r rune) bool105106func (remove) Reset() {}107108// Span implements transform.Spanner.109func (t remove) Span(src []byte, atEOF bool) (n int, err error) {110for r, size := rune(0), 0; n < len(src); {111if r = rune(src[n]); r < utf8.RuneSelf {112size = 1113} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {114// Invalid rune.115if !atEOF && !utf8.FullRune(src[n:]) {116err = transform.ErrShortSrc117} else {118err = transform.ErrEndOfSpan119}120break121}122if t(r) {123err = transform.ErrEndOfSpan124break125}126n += size127}128return129}130131// Transform implements transform.Transformer.132func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {133for r, size := rune(0), 0; nSrc < len(src); {134if r = rune(src[nSrc]); r < utf8.RuneSelf {135size = 1136} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {137// Invalid rune.138if !atEOF && !utf8.FullRune(src[nSrc:]) {139err = transform.ErrShortSrc140break141}142// We replace illegal bytes with RuneError. Not doing so might143// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.144// The resulting byte sequence may subsequently contain runes145// for which t(r) is true that were passed unnoticed.146if !t(utf8.RuneError) {147if nDst+3 > len(dst) {148err = transform.ErrShortDst149break150}151dst[nDst+0] = runeErrorString[0]152dst[nDst+1] = runeErrorString[1]153dst[nDst+2] = runeErrorString[2]154nDst += 3155}156nSrc++157continue158}159if t(r) {160nSrc += size161continue162}163if nDst+size > len(dst) {164err = transform.ErrShortDst165break166}167for i := 0; i < size; i++ {168dst[nDst] = src[nSrc]169nDst++170nSrc++171}172}173return174}175176// Map returns a Transformer that maps the runes in the input using the given177// mapping. Illegal bytes in the input are converted to utf8.RuneError before178// being passed to the mapping func.179func Map(mapping func(rune) rune) Transformer {180return Transformer{mapper(mapping)}181}182183type mapper func(rune) rune184185func (mapper) Reset() {}186187// Span implements transform.Spanner.188func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {189for r, size := rune(0), 0; n < len(src); n += size {190if r = rune(src[n]); r < utf8.RuneSelf {191size = 1192} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {193// Invalid rune.194if !atEOF && !utf8.FullRune(src[n:]) {195err = transform.ErrShortSrc196} else {197err = transform.ErrEndOfSpan198}199break200}201if t(r) != r {202err = transform.ErrEndOfSpan203break204}205}206return n, err207}208209// Transform implements transform.Transformer.210func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {211var replacement rune212var b [utf8.UTFMax]byte213214for r, size := rune(0), 0; nSrc < len(src); {215if r = rune(src[nSrc]); r < utf8.RuneSelf {216if replacement = t(r); replacement < utf8.RuneSelf {217if nDst == len(dst) {218err = transform.ErrShortDst219break220}221dst[nDst] = byte(replacement)222nDst++223nSrc++224continue225}226size = 1227} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {228// Invalid rune.229if !atEOF && !utf8.FullRune(src[nSrc:]) {230err = transform.ErrShortSrc231break232}233234if replacement = t(utf8.RuneError); replacement == utf8.RuneError {235if nDst+3 > len(dst) {236err = transform.ErrShortDst237break238}239dst[nDst+0] = runeErrorString[0]240dst[nDst+1] = runeErrorString[1]241dst[nDst+2] = runeErrorString[2]242nDst += 3243nSrc++244continue245}246} else if replacement = t(r); replacement == r {247if nDst+size > len(dst) {248err = transform.ErrShortDst249break250}251for i := 0; i < size; i++ {252dst[nDst] = src[nSrc]253nDst++254nSrc++255}256continue257}258259n := utf8.EncodeRune(b[:], replacement)260261if nDst+n > len(dst) {262err = transform.ErrShortDst263break264}265for i := 0; i < n; i++ {266dst[nDst] = b[i]267nDst++268}269nSrc += size270}271return272}273274// ReplaceIllFormed returns a transformer that replaces all input bytes that are275// not part of a well-formed UTF-8 code sequence with utf8.RuneError.276func ReplaceIllFormed() Transformer {277return Transformer{&replaceIllFormed{}}278}279280type replaceIllFormed struct{ transform.NopResetter }281282func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {283for n < len(src) {284// ASCII fast path.285if src[n] < utf8.RuneSelf {286n++287continue288}289290r, size := utf8.DecodeRune(src[n:])291292// Look for a valid non-ASCII rune.293if r != utf8.RuneError || size != 1 {294n += size295continue296}297298// Look for short source data.299if !atEOF && !utf8.FullRune(src[n:]) {300err = transform.ErrShortSrc301break302}303304// We have an invalid rune.305err = transform.ErrEndOfSpan306break307}308return n, err309}310311func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {312for nSrc < len(src) {313// ASCII fast path.314if r := src[nSrc]; r < utf8.RuneSelf {315if nDst == len(dst) {316err = transform.ErrShortDst317break318}319dst[nDst] = r320nDst++321nSrc++322continue323}324325// Look for a valid non-ASCII rune.326if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {327if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {328err = transform.ErrShortDst329break330}331nDst += size332nSrc += size333continue334}335336// Look for short source data.337if !atEOF && !utf8.FullRune(src[nSrc:]) {338err = transform.ErrShortSrc339break340}341342// We have an invalid rune.343if nDst+3 > len(dst) {344err = transform.ErrShortDst345break346}347dst[nDst+0] = runeErrorString[0]348dst[nDst+1] = runeErrorString[1]349dst[nDst+2] = runeErrorString[2]350nDst += 3351nSrc++352}353return nDst, nSrc, err354}355356357