Path: blob/main/vendor/github.com/pelletier/go-toml/v2/internal/characters/utf8.go
2907 views
package characters

import (
	"unicode/utf8"
)

// utf8Err describes the first invalid byte sequence found by
// Utf8TomlValidAlreadyEscaped.
type utf8Err struct {
	// Index is the byte offset, in the validated slice, at which the invalid
	// sequence starts.
	Index int
	// Size is the number of bytes reported as part of the invalid sequence.
	// It is 0 when no invalid sequence was found.
	Size int
}

// Zero reports whether u describes no error, that is, whether Size is 0.
func (u utf8Err) Zero() bool {
	return u.Size == 0
}

// Utf8TomlValidAlreadyEscaped verifies that a given string is only made of
// valid UTF-8 characters allowed by the TOML spec:
//
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters other than tab (U+0000
// to U+0008, U+000A to U+001F, U+007F).
//
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
// when a character is not allowed.
//
// The returned utf8Err is Zero() if the string is valid, or contains the byte
// index and size of the invalid character. For a multi-byte sequence the size
// covers the bytes up to and including the first offending continuation byte;
// for a sequence truncated by the end of the input it covers the remaining
// bytes.
//
// quotation mark => already checked
// backslash => already checked
// 0-0x8 => invalid
// 0x9 => tab, ok
// 0xA - 0x1F => invalid
// 0x7F => invalid
//
// ASCII bytes are classified by InvalidAscii, which is declared outside this
// file.
func Utf8TomlValidAlreadyEscaped(p []byte) utf8Err {
	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	// offset tracks how many bytes have been sliced off the front of p so
	// that reported indexes stay relative to the original input.
	offset := 0
	for len(p) >= 8 {
		// Combining two 32 bit loads allows the same code to be used
		// for 32 and 64 bit platforms.
		// The compiler can generate a 32bit load for first32 and second32
		// on many platforms. See test/codegen/memcombine.go.
		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (first32|second32)&0x80808080 != 0 {
			// Found a non ASCII byte (>= RuneSelf).
			break
		}

		// All 8 bytes are ASCII; they still have to be allowed by TOML.
		for i, b := range p[:8] {
			if InvalidAscii(b) {
				return utf8Err{Index: offset + i, Size: 1}
			}
		}

		p = p[8:]
		offset += 8
	}

	// Slow path: decode one character at a time.
	n := len(p)
	for i := 0; i < n; {
		pi := p[i]
		if pi < utf8.RuneSelf {
			if InvalidAscii(pi) {
				return utf8Err{Index: offset + i, Size: 1}
			}
			i++
			continue
		}

		x := first[pi]
		if x == xx {
			// Illegal starter byte.
			return utf8Err{Index: offset + i, Size: 1}
		}

		// The low bits of x hold the encoded length (2, 3 or 4 here).
		size := int(x & 7)
		if i+size > n {
			// Short or invalid.
			return utf8Err{Index: offset + i, Size: n - i}
		}

		// The high nibble of x selects the allowed range of the second byte.
		accept := acceptRanges[x>>4]
		if c := p[i+1]; c < accept.lo || accept.hi < c {
			return utf8Err{Index: offset + i, Size: 2}
		}
		if size >= 3 {
			if c := p[i+2]; c < locb || hicb < c {
				return utf8Err{Index: offset + i, Size: 3}
			}
		}
		if size == 4 {
			if c := p[i+3]; c < locb || hicb < c {
				return utf8Err{Index: offset + i, Size: 4}
			}
		}
		i += size
	}

	return utf8Err{}
}

// Utf8ValidNext returns the size in bytes of the character at the start of p
// if it is valid, 0 otherwise. An empty p has no next character and yields 0.
//
// ASCII bytes are classified by InvalidAscii, which is declared outside this
// file.
func Utf8ValidNext(p []byte) int {
	if len(p) == 0 {
		// Nothing to decode; avoid indexing into an empty slice.
		return 0
	}

	c := p[0]

	if c < utf8.RuneSelf {
		if InvalidAscii(c) {
			return 0
		}
		return 1
	}

	x := first[c]
	if x == xx {
		// Illegal starter byte.
		return 0
	}

	// The low bits of x hold the encoded length (2, 3 or 4 here).
	size := int(x & 7)
	if size > len(p) {
		// Short or invalid.
		return 0
	}

	// The high nibble of x selects the allowed range of the second byte.
	accept := acceptRanges[x>>4]
	if c := p[1]; c < accept.lo || accept.hi < c {
		return 0
	}
	if size >= 3 {
		if c := p[2]; c < locb || hicb < c {
			return 0
		}
	}
	if size == 4 {
		if c := p[3]; c < locb || hicb < c {
			return 0
		}
	}

	return size
}

// acceptRange gives the range of valid values for the second byte in a UTF-8
// sequence.
type acceptRange struct {
	lo uint8 // lowest value for second byte.
	hi uint8 // highest value for second byte.
}

// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
var acceptRanges = [16]acceptRange{
	0: {locb, hicb},
	1: {0xA0, hicb},
	2: {locb, 0x9F},
	3: {0x90, hicb},
	4: {locb, 0x8F},
}

// first is information about the first byte in a UTF-8 sequence.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}

const (
	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// table above. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)