CoCalc -- utf8.go

GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/vendor/github.com/pelletier/go-toml/v2/internal/characters/utf8.go
2907 views
1
package characters
2

3
import (
4
	"unicode/utf8"
5
)
6

7
// utf8Err describes where an invalid character sits inside a byte slice.
// The zero value stands for "no error".
type utf8Err struct {
	// Index is the byte offset at which the offending character starts.
	Index int
	// Size is the number of bytes attributed to the offending character.
	// A Size of 0 means that nothing invalid was recorded.
	Size int
}
11

12
// Zero reports whether e carries no error, which is signalled by a Size of 0.
func (e utf8Err) Zero() bool {
	return e.Size == 0
}
15

16
// Verified that a given string is only made of valid UTF-8 characters allowed
17
// by the TOML spec:
18
//
19
// Any Unicode character may be used except those that must be escaped:
20
// quotation mark, backslash, and the control characters other than tab (U+0000
21
// to U+0008, U+000A to U+001F, U+007F).
22
//
23
// It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
24
// when a character is not allowed.
25
//
26
// The returned utf8Err is Zero() if the string is valid, or contains the byte
27
// index and size of the invalid character.
28
//
29
// quotation mark => already checked
30
// backslash => already checked
31
// 0-0x8 => invalid
32
// 0x9 => tab, ok
33
// 0xA - 0x1F => invalid
34
// 0x7F => invalid
35
func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
36
	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
37
	offset := 0
38
	for len(p) >= 8 {
39
		// Combining two 32 bit loads allows the same code to be used
40
		// for 32 and 64 bit platforms.
41
		// The compiler can generate a 32bit load for first32 and second32
42
		// on many platforms. See test/codegen/memcombine.go.
43
		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
44
		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
45
		if (first32|second32)&0x80808080 != 0 {
46
			// Found a non ASCII byte (>= RuneSelf).
47
			break
48
		}
49

50
		for i, b := range p[:8] {
51
			if InvalidAscii(b) {
52
				err.Index = offset + i
53
				err.Size = 1
54
				return
55
			}
56
		}
57

58
		p = p[8:]
59
		offset += 8
60
	}
61
	n := len(p)
62
	for i := 0; i < n; {
63
		pi := p[i]
64
		if pi < utf8.RuneSelf {
65
			if InvalidAscii(pi) {
66
				err.Index = offset + i
67
				err.Size = 1
68
				return
69
			}
70
			i++
71
			continue
72
		}
73
		x := first[pi]
74
		if x == xx {
75
			// Illegal starter byte.
76
			err.Index = offset + i
77
			err.Size = 1
78
			return
79
		}
80
		size := int(x & 7)
81
		if i+size > n {
82
			// Short or invalid.
83
			err.Index = offset + i
84
			err.Size = n - i
85
			return
86
		}
87
		accept := acceptRanges[x>>4]
88
		if c := p[i+1]; c < accept.lo || accept.hi < c {
89
			err.Index = offset + i
90
			err.Size = 2
91
			return
92
		} else if size == 2 {
93
		} else if c := p[i+2]; c < locb || hicb < c {
94
			err.Index = offset + i
95
			err.Size = 3
96
			return
97
		} else if size == 3 {
98
		} else if c := p[i+3]; c < locb || hicb < c {
99
			err.Index = offset + i
100
			err.Size = 4
101
			return
102
		}
103
		i += size
104
	}
105
	return
106
}
107

108
// Return the size of the next rune if valid, 0 otherwise.
109
func Utf8ValidNext(p []byte) int {
110
	c := p[0]
111

112
	if c < utf8.RuneSelf {
113
		if InvalidAscii(c) {
114
			return 0
115
		}
116
		return 1
117
	}
118

119
	x := first[c]
120
	if x == xx {
121
		// Illegal starter byte.
122
		return 0
123
	}
124
	size := int(x & 7)
125
	if size > len(p) {
126
		// Short or invalid.
127
		return 0
128
	}
129
	accept := acceptRanges[x>>4]
130
	if c := p[1]; c < accept.lo || accept.hi < c {
131
		return 0
132
	} else if size == 2 {
133
	} else if c := p[2]; c < locb || hicb < c {
134
		return 0
135
	} else if size == 3 {
136
	} else if c := p[3]; c < locb || hicb < c {
137
		return 0
138
	}
139

140
	return size
141
}
142

143
// acceptRange is the span of values that the second byte of a UTF-8 sequence
// may take.
type acceptRange struct {
	lo, hi uint8 // lowest and highest value for the second byte
}
149

150
// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
151
var acceptRanges = [16]acceptRange{
152
	0: {locb, hicb},
153
	1: {0xA0, hicb},
154
	2: {locb, 0x9F},
155
	3: {0x90, hicb},
156
	4: {locb, 0x8F},
157
}
158

159
// first is information about the first byte in a UTF-8 sequence.
160
var first = [256]uint8{
161
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
162
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
163
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
164
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
165
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
166
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
167
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
168
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
169
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
170
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
171
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
172
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
173
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
174
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
175
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
176
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
177
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
178
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
179
}
180

181
const (
	// Lowest and highest value of an ordinary continuation byte.
	locb = 0x80
	hicb = 0xBF

	// Entry values for the first table. The two-character names are chosen
	// so that the table rows line up nicely. The first nibble is an index
	// into acceptRanges, or F for the special one-byte cases. The second
	// nibble is the rune length, or the status for the special one-byte
	// cases.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
200

201
Product

Resources

Company