Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
kardolus
GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/vendor/golang.org/x/text/internal/language/parse.go
2893 views
1
// Copyright 2013 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
package language
6
7
import (
8
"bytes"
9
"errors"
10
"fmt"
11
"sort"
12
13
"golang.org/x/text/internal/tag"
14
)
15
16
// isAlpha reports whether b is a letter rather than a digit.
// The caller must already know that b is an ASCII letter or digit, which is
// why a single comparison against '9' is enough.
func isAlpha(b byte) bool {
	return '9' < b
}
21
22
// isAlphaNum reports whether every byte of s is an ASCII letter or an ASCII
// digit. An empty s yields true.
func isAlphaNum(s []byte) bool {
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
31
32
// Sentinel errors reported by the parsing functions.
var (
	// ErrSyntax is the error the parsing functions return for input that is
	// not well-formed according to BCP 47.
	// TODO: return the position at which the syntax error occurred?
	ErrSyntax = errors.New("language: tag is not well-formed")

	// ErrDuplicateKey is the error returned for a tag whose -u section
	// contains the same key twice with different values.
	ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
)
40
41
// ValueError is the error reported by the parsing functions for input that
// is well-formed but contains a subtag that is not recognized as a valid
// value. The offending subtag is stored in a fixed eight-byte array.
type ValueError struct {
	v [8]byte
}

// NewValueError creates a ValueError for the given subtag. At most the first
// eight bytes of tag are retained.
func NewValueError(tag []byte) ValueError {
	e := ValueError{}
	copy(e.v[:], tag)
	return e
}

// tag returns the stored subtag: the bytes preceding the first zero byte, or
// all eight bytes when the array contains no zero byte.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
		return e.v[:n]
	}
	return e.v[:]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	subtag := e.tag()
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	subtag := e.tag()
	return string(subtag)
}
72
73
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
// The scanner methods rewrite the buffer in place while scanning: separators
// are normalized to '-', malformed tokens are removed, and tokens may be
// replaced or lower-cased.
type scanner struct {
	b     []byte                      // buffer being scanned; modified in place by the scanner methods
	bytes [max99thPercentileSize]byte // inline storage that makeScannerString copies short inputs into
	token []byte                      // current token, a sub-slice of b; nil when scan found no token
	start int                         // start position of the current token
	end   int                         // end position of the current token
	next  int                         // next point for scan
	err   error                       // error recorded by setError; ErrSyntax replaces any other error
	done  bool                        // set by scan once it runs out of input
}
84
85
// makeScannerString returns an initialized scanner over a copy of s. Inputs
// that fit in the scanner's inline array are copied there; longer inputs are
// copied into a freshly allocated byte slice.
func makeScannerString(s string) scanner {
	var scan scanner
	if len(s) > len(scan.bytes) {
		scan.b = []byte(s)
	} else {
		n := copy(scan.bytes[:], s)
		scan.b = scan.bytes[:n]
	}
	scan.init()
	return scan
}
95
96
// makeScanner returns a scanner using b as the input buffer.
97
// b is not copied and may be modified by the scanner routines.
98
func makeScanner(b []byte) scanner {
99
scan := scanner{b: b}
100
scan.init()
101
return scan
102
}
103
104
// init normalizes every '_' in the buffer to '-' and then scans the first
// token.
func (s *scanner) init() {
	for i := range s.b {
		if s.b[i] == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}
112
113
// restToLower converts the string between start and end to lower case.
114
func (s *scanner) toLower(start, end int) {
115
for i := start; i < end; i++ {
116
c := s.b[i]
117
if 'A' <= c && c <= 'Z' {
118
s.b[i] += 'a' - 'A'
119
}
120
}
121
}
122
123
// setError records e as the scanner's error. An error that is already set is
// kept, with one exception: ErrSyntax replaces any previously recorded error
// that is not itself ErrSyntax.
func (s *scanner) setError(e error) {
	switch {
	case s.err == nil, e == ErrSyntax && s.err != ErrSyntax:
		s.err = e
	}
}
128
129
// resizeRange shrinks or grows the array at position oldStart such that
130
// a new string of size newSize can fit between oldStart and oldEnd.
131
// Sets the scan point to after the resized range.
132
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
133
s.start = oldStart
134
if end := oldStart + newSize; end != oldEnd {
135
diff := end - oldEnd
136
var b []byte
137
if n := len(s.b) + diff; n > cap(s.b) {
138
b = make([]byte, n)
139
copy(b, s.b[:oldStart])
140
} else {
141
b = s.b[:n]
142
}
143
copy(b[end:], s.b[oldEnd:])
144
s.b = b
145
s.next = end + (s.next - s.end)
146
s.end = end
147
}
148
}
149
150
// replace replaces the current token with repl.
151
func (s *scanner) replace(repl string) {
152
s.resizeRange(s.start, s.end, len(repl))
153
copy(s.b[s.start:], repl)
154
}
155
156
// gobble removes the current token from the input.
157
// Caller must call scan after calling gobble.
158
func (s *scanner) gobble(e error) {
159
s.setError(e)
160
if s.start == 0 {
161
s.b = s.b[:+copy(s.b, s.b[s.next:])]
162
s.end = 0
163
} else {
164
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
165
s.end = s.start - 1
166
}
167
s.next = s.start
168
}
169
170
// deleteRange removes the given range from s.b before the current token.
171
func (s *scanner) deleteRange(start, end int) {
172
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
173
diff := end - start
174
s.next -= diff
175
s.start -= diff
176
s.end -= diff
177
}
178
179
// scan parses the next token of a BCP 47 string. Tokens that are larger
180
// than 8 characters or include non-alphanumeric characters result in an error
181
// and are gobbled and removed from the output.
182
// It returns the end position of the last token consumed.
183
func (s *scanner) scan() (end int) {
184
end = s.end
185
s.token = nil
186
for s.start = s.next; s.next < len(s.b); {
187
i := bytes.IndexByte(s.b[s.next:], '-')
188
if i == -1 {
189
s.end = len(s.b)
190
s.next = len(s.b)
191
i = s.end - s.start
192
} else {
193
s.end = s.next + i
194
s.next = s.end + 1
195
}
196
token := s.b[s.start:s.end]
197
if i < 1 || i > 8 || !isAlphaNum(token) {
198
s.gobble(ErrSyntax)
199
continue
200
}
201
s.token = token
202
return end
203
}
204
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
205
s.setError(ErrSyntax)
206
s.b = s.b[:len(s.b)-1]
207
}
208
s.done = true
209
return end
210
}
211
212
// acceptMinSize parses multiple tokens of the given size or greater.
213
// It returns the end position of the last token consumed.
214
func (s *scanner) acceptMinSize(min int) (end int) {
215
end = s.end
216
s.scan()
217
for ; len(s.token) >= min; s.scan() {
218
end = s.end
219
}
220
return end
221
}
222
223
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
224
// failed it returns an error and any part of the tag that could be parsed.
225
// If parsing succeeded but an unknown value was found, it returns
226
// ValueError. The Tag returned in this case is just stripped of the unknown
227
// value. All other values are preserved. It accepts tags in the BCP 47 format
228
// and extensions to this standard defined in
229
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
230
func Parse(s string) (t Tag, err error) {
231
// TODO: consider supporting old-style locale key-value pairs.
232
if s == "" {
233
return Und, ErrSyntax
234
}
235
defer func() {
236
if recover() != nil {
237
t = Und
238
err = ErrSyntax
239
return
240
}
241
}()
242
if len(s) <= maxAltTaglen {
243
b := [maxAltTaglen]byte{}
244
for i, c := range s {
245
// Generating invalid UTF-8 is okay as it won't match.
246
if 'A' <= c && c <= 'Z' {
247
c += 'a' - 'A'
248
} else if c == '_' {
249
c = '-'
250
}
251
b[i] = byte(c)
252
}
253
if t, ok := grandfathered(b); ok {
254
return t, nil
255
}
256
}
257
scan := makeScannerString(s)
258
return parse(&scan, s)
259
}
260
261
// parse builds a Tag from scan, which must already be positioned on its first
// token, and returns it together with the error recorded by the scanner.
//
// The length of the first token selects the path taken:
//   - 0 or 1: only a private-use tag (first token 'x') is accepted, and it is
//     handled by parseExtensions; anything else is ErrSyntax.
//   - 4 or more: ErrSyntax.
//   - 2 or 3: the usual case, handled by parseTag and followed by
//     parseExtensions when a single-character token comes next.
//
// s is the original input. It is reused as the Tag's string when, truncated
// to the parsed length, it compares equal to the normalized buffer; otherwise
// a new string is made from the buffer.
func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	switch n := len(scan.token); {
	case n <= 1:
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	case n >= 4:
		return Und, ErrSyntax
	default: // the usual case
		t, end = parseTag(scan, true)
		if len(scan.token) == 1 {
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			// Unparsed input remains after the tag: flag it and cut it off.
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}

	if int(t.pVariant) >= len(scan.b) {
		// Nothing follows the variant offset, so no string form is stored.
		t.pVariant, t.pExt = 0, 0
		return t, scan.err
	}
	if end < len(s) {
		s = s[:end]
	}
	if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
		t.str = s
	} else {
		t.str = string(scan.b)
	}
	return t, scan.err
}
296
297
// parseTag parses language, script, region and variants.
298
// It returns a Tag and the end position in the input that was parsed.
299
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
300
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
301
var e error
302
// TODO: set an error if an unknown lang, script or region is encountered.
303
t.LangID, e = getLangID(scan.token)
304
scan.setError(e)
305
scan.replace(t.LangID.String())
306
langStart := scan.start
307
end = scan.scan()
308
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
309
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
310
// to a tag of the form <extlang>.
311
if doNorm {
312
lang, e := getLangID(scan.token)
313
if lang != 0 {
314
t.LangID = lang
315
langStr := lang.String()
316
copy(scan.b[langStart:], langStr)
317
scan.b[langStart+len(langStr)] = '-'
318
scan.start = langStart + len(langStr) + 1
319
}
320
scan.gobble(e)
321
}
322
end = scan.scan()
323
}
324
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
325
t.ScriptID, e = getScriptID(script, scan.token)
326
if t.ScriptID == 0 {
327
scan.gobble(e)
328
}
329
end = scan.scan()
330
}
331
if n := len(scan.token); n >= 2 && n <= 3 {
332
t.RegionID, e = getRegionID(scan.token)
333
if t.RegionID == 0 {
334
scan.gobble(e)
335
} else {
336
scan.replace(t.RegionID.String())
337
}
338
end = scan.scan()
339
}
340
scan.toLower(scan.start, len(scan.b))
341
t.pVariant = byte(end)
342
end = parseVariants(scan, end, t)
343
t.pExt = uint16(end)
344
return t, end
345
}
346
347
// separator is the single '-' byte used with bytes.Join when variant,
// attribute, key and extension subtags are reassembled into the buffer.
var separator = []byte{'-'}
348
349
// parseVariants scans tokens as long as each token is a valid variant string.
350
// Duplicate variants are removed.
351
func parseVariants(scan *scanner, end int, t Tag) int {
352
start := scan.start
353
varIDBuf := [4]uint8{}
354
variantBuf := [4][]byte{}
355
varID := varIDBuf[:0]
356
variant := variantBuf[:0]
357
last := -1
358
needSort := false
359
for ; len(scan.token) >= 4; scan.scan() {
360
// TODO: measure the impact of needing this conversion and redesign
361
// the data structure if there is an issue.
362
v, ok := variantIndex[string(scan.token)]
363
if !ok {
364
// unknown variant
365
// TODO: allow user-defined variants?
366
scan.gobble(NewValueError(scan.token))
367
continue
368
}
369
varID = append(varID, v)
370
variant = append(variant, scan.token)
371
if !needSort {
372
if last < int(v) {
373
last = int(v)
374
} else {
375
needSort = true
376
// There is no legal combinations of more than 7 variants
377
// (and this is by no means a useful sequence).
378
const maxVariants = 8
379
if len(varID) > maxVariants {
380
break
381
}
382
}
383
}
384
end = scan.end
385
}
386
if needSort {
387
sort.Sort(variantsSort{varID, variant})
388
k, l := 0, -1
389
for i, v := range varID {
390
w := int(v)
391
if l == w {
392
// Remove duplicates.
393
continue
394
}
395
varID[k] = varID[i]
396
variant[k] = variant[i]
397
k++
398
l = w
399
}
400
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
401
end = start - 1
402
} else {
403
scan.resizeRange(start, end, len(str))
404
copy(scan.b[scan.start:], str)
405
end = scan.end
406
}
407
}
408
return end
409
}
410
411
// variantsSort orders two parallel slices by the values in i: element k of v
// always moves together with element k of i. Its Len, Swap and Less methods
// are the ones sort.Sort requires.
type variantsSort struct {
	i []uint8
	v [][]byte
}

// Len returns the number of entries in s.i.
func (s variantsSort) Len() int {
	n := len(s.i)
	return n
}

// Swap exchanges entries i and j in both slices.
func (s variantsSort) Swap(i, j int) {
	s.i[j], s.i[i] = s.i[i], s.i[j]
	s.v[j], s.v[i] = s.v[i], s.v[j]
}

// Less reports whether the value at position i of s.i is smaller than the
// value at position j.
func (s variantsSort) Less(i, j int) bool {
	return s.i[j] > s.i[i]
}
428
429
// bytesSort orders byte slices by their first n bytes only. Its Len, Swap and
// Less methods are the ones sort.Sort and sort.Stable require. Every element
// must be at least n bytes long.
type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

// Len returns the number of byte slices.
func (b bytesSort) Len() int {
	n := len(b.b)
	return n
}

// Swap exchanges the byte slices at positions i and j.
func (b bytesSort) Swap(i, j int) {
	b.b[j], b.b[i] = b.b[i], b.b[j]
}

// Less reports whether the first n bytes of element i sort before the first
// n bytes of element j. Elements whose first n bytes are equal are not Less
// in either direction.
func (b bytesSort) Less(i, j int) bool {
	x, y := b.b[i], b.b[j]
	for k := 0; k < b.n; k++ {
		if x[k] != y[k] {
			return x[k] < y[k]
		}
	}
	return false
}
451
452
// parseExtensions parses and normalizes the extensions in the buffer.
453
// It returns the last position of scan.b that is part of any extension.
454
// It also trims scan.b to remove excess parts accordingly.
455
func parseExtensions(scan *scanner) int {
456
start := scan.start
457
exts := [][]byte{}
458
private := []byte{}
459
end := scan.end
460
for len(scan.token) == 1 {
461
extStart := scan.start
462
ext := scan.token[0]
463
end = parseExtension(scan)
464
extension := scan.b[extStart:end]
465
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
466
scan.setError(ErrSyntax)
467
end = extStart
468
continue
469
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
470
scan.b = scan.b[:end]
471
return end
472
} else if ext == 'x' {
473
private = extension
474
break
475
}
476
exts = append(exts, extension)
477
}
478
sort.Sort(bytesSort{exts, 1})
479
if len(private) > 0 {
480
exts = append(exts, private)
481
}
482
scan.b = scan.b[:start]
483
if len(exts) > 0 {
484
scan.b = append(scan.b, bytes.Join(exts, separator)...)
485
} else if start > 0 {
486
// Strip trailing '-'.
487
scan.b = scan.b[:start-1]
488
}
489
return end
490
}
491
492
// parseExtension parses a single extension and returns the position of
493
// the extension end.
494
func parseExtension(scan *scanner) int {
495
start, end := scan.start, scan.end
496
switch scan.token[0] {
497
case 'u': // https://www.ietf.org/rfc/rfc6067.txt
498
attrStart := end
499
scan.scan()
500
for last := []byte{}; len(scan.token) > 2; scan.scan() {
501
if bytes.Compare(scan.token, last) != -1 {
502
// Attributes are unsorted. Start over from scratch.
503
p := attrStart + 1
504
scan.next = p
505
attrs := [][]byte{}
506
for scan.scan(); len(scan.token) > 2; scan.scan() {
507
attrs = append(attrs, scan.token)
508
end = scan.end
509
}
510
sort.Sort(bytesSort{attrs, 3})
511
copy(scan.b[p:], bytes.Join(attrs, separator))
512
break
513
}
514
last = scan.token
515
end = scan.end
516
}
517
// Scan key-type sequences. A key is of length 2 and may be followed
518
// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
519
var last, key []byte
520
for attrEnd := end; len(scan.token) == 2; last = key {
521
key = scan.token
522
end = scan.end
523
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
524
end = scan.end
525
}
526
// TODO: check key value validity
527
if bytes.Compare(key, last) != 1 || scan.err != nil {
528
// We have an invalid key or the keys are not sorted.
529
// Start scanning keys from scratch and reorder.
530
p := attrEnd + 1
531
scan.next = p
532
keys := [][]byte{}
533
for scan.scan(); len(scan.token) == 2; {
534
keyStart := scan.start
535
end = scan.end
536
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
537
end = scan.end
538
}
539
keys = append(keys, scan.b[keyStart:end])
540
}
541
sort.Stable(bytesSort{keys, 2})
542
if n := len(keys); n > 0 {
543
k := 0
544
for i := 1; i < n; i++ {
545
if !bytes.Equal(keys[k][:2], keys[i][:2]) {
546
k++
547
keys[k] = keys[i]
548
} else if !bytes.Equal(keys[k], keys[i]) {
549
scan.setError(ErrDuplicateKey)
550
}
551
}
552
keys = keys[:k+1]
553
}
554
reordered := bytes.Join(keys, separator)
555
if e := p + len(reordered); e < end {
556
scan.deleteRange(e, end)
557
end = e
558
}
559
copy(scan.b[p:], reordered)
560
break
561
}
562
}
563
case 't': // https://www.ietf.org/rfc/rfc6497.txt
564
scan.scan()
565
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
566
_, end = parseTag(scan, false)
567
scan.toLower(start, end)
568
}
569
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
570
end = scan.acceptMinSize(3)
571
}
572
case 'x':
573
end = scan.acceptMinSize(1)
574
default:
575
end = scan.acceptMinSize(2)
576
}
577
return end
578
}
579
580
// getExtension returns the name, body and end position of the extension.
581
func getExtension(s string, p int) (end int, ext string) {
582
if s[p] == '-' {
583
p++
584
}
585
if s[p] == 'x' {
586
return len(s), s[p:]
587
}
588
end = nextExtension(s, p)
589
return end, s[p:end]
590
}
591
592
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the position of the
// leading '-' of that pattern, or len(s) when the pattern does not occur.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		if s[p] != '-' {
			p++
			continue
		}
		if s[p+2] == '-' {
			return p
		}
		// A '-' that does not introduce a singleton: skip past it and the two
		// bytes that follow.
		p += 3
	}
	return len(s)
}
609
610