Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
kardolus
GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/vendor/golang.org/x/text/internal/language/lookup.go
2893 views
1
// Copyright 2013 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
package language
6
7
import (
8
"bytes"
9
"fmt"
10
"sort"
11
"strconv"
12
13
"golang.org/x/text/internal/tag"
14
)
15
16
// findIndex tries to find the given tag in idx and returns a standardized error
17
// if it could not be found.
18
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
19
if !tag.FixCase(form, key) {
20
return 0, ErrSyntax
21
}
22
i := idx.Index(key)
23
if i == -1 {
24
return 0, NewValueError(key)
25
}
26
return i, nil
27
}
28
29
// searchUint binary-searches imap for key and returns the smallest index i
// with imap[i] >= key, or len(imap) when no element qualifies. imap is
// expected to be sorted in ascending order, as required by sort.Search.
func searchUint(imap []uint16, key uint16) int {
	atLeastKey := func(pos int) bool {
		return imap[pos] >= key
	}
	return sort.Search(len(imap), atLeastKey)
}
34
35
// Language is a compact numeric ID for a language subtag. The zero value
// renders as "und", and values at or above langNoIndexOffset encode a
// 3-letter code directly rather than indexing the lang table (see String).
type Language uint16
36
37
// getLangID returns the langID of s if s is a canonical subtag
38
// or langUnknown if s is not a canonical subtag.
39
func getLangID(s []byte) (Language, error) {
40
if len(s) == 2 {
41
return getLangISO2(s)
42
}
43
return getLangISO3(s)
44
}
45
46
// TODO language normalization as well as the AliasMaps could be moved to the
47
// higher level package, but it is a bit tricky to separate the generation.
48
49
// Canonicalize returns the replacement for id found in AliasMap together with
// the kind of alias that was applied. When id has no alias entry it returns
// id itself and AliasTypeUnknown (see normLang).
func (id Language) Canonicalize() (Language, AliasType) {
	canon, kind := normLang(id)
	return canon, kind
}
52
53
// normLang returns the mapped langID of id according to mapping m.
54
func normLang(id Language) (Language, AliasType) {
55
k := sort.Search(len(AliasMap), func(i int) bool {
56
return AliasMap[i].From >= uint16(id)
57
})
58
if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
59
return Language(AliasMap[k].To), AliasTypes[k]
60
}
61
return id, AliasTypeUnknown
62
}
63
64
// getLangISO2 returns the langID for the given 2-letter ISO language code
65
// or unknownLang if this does not exist.
66
func getLangISO2(s []byte) (Language, error) {
67
if !tag.FixCase("zz", s) {
68
return 0, ErrSyntax
69
}
70
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
71
return Language(i), nil
72
}
73
return 0, NewValueError(s)
74
}
75
76
// base is the number of letters in 'a'..'z' (26). strToInt and intToStr use
// it as the radix when packing a lowercase string into an integer and back.
const base = 'z' - 'a' + 1
77
78
func strToInt(s []byte) uint {
79
v := uint(0)
80
for i := 0; i < len(s); i++ {
81
v *= base
82
v += uint(s[i] - 'a')
83
}
84
return v
85
}
86
87
// converts the given integer to the original ASCII string passed to strToInt.
88
// len(s) must match the number of characters obtained.
89
func intToStr(v uint, s []byte) {
90
for i := len(s) - 1; i >= 0; i-- {
91
s[i] = byte(v%base) + 'a'
92
v /= base
93
}
94
}
95
96
// getLangISO3 returns the langID for the given 3-letter ISO language code
97
// or unknownLang if this does not exist.
98
func getLangISO3(s []byte) (Language, error) {
99
if tag.FixCase("und", s) {
100
// first try to match canonical 3-letter entries
101
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
102
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
103
// We treat "und" as special and always translate it to "unspecified".
104
// Note that ZZ and Zzzz are private use and are not treated as
105
// unspecified by default.
106
id := Language(i)
107
if id == nonCanonicalUnd {
108
return 0, nil
109
}
110
return id, nil
111
}
112
}
113
if i := altLangISO3.Index(s); i != -1 {
114
return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
115
}
116
n := strToInt(s)
117
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
118
return Language(n) + langNoIndexOffset, nil
119
}
120
// Check for non-canonical uses of ISO3.
121
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
122
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
123
return Language(i), nil
124
}
125
}
126
return 0, NewValueError(s)
127
}
128
return 0, ErrSyntax
129
}
130
131
// StringToBuf writes the string to b and returns the number of bytes
132
// written. cap(b) must be >= 3.
133
func (id Language) StringToBuf(b []byte) int {
134
if id >= langNoIndexOffset {
135
intToStr(uint(id)-langNoIndexOffset, b[:3])
136
return 3
137
} else if id == 0 {
138
return copy(b, "und")
139
}
140
l := lang[id<<2:]
141
if l[3] == 0 {
142
return copy(b, l[:3])
143
}
144
return copy(b, l[:2])
145
}
146
147
// String returns the BCP 47 representation of the langID.
148
// Use b as variable name, instead of id, to ensure the variable
149
// used is consistent with that of Base in which this type is embedded.
150
func (b Language) String() string {
151
if b == 0 {
152
return "und"
153
} else if b >= langNoIndexOffset {
154
b -= langNoIndexOffset
155
buf := [3]byte{}
156
intToStr(uint(b), buf[:])
157
return string(buf[:])
158
}
159
l := lang.Elem(int(b))
160
if l[3] == 0 {
161
return l[:3]
162
}
163
return l[:2]
164
}
165
166
// ISO3 returns the ISO 639-3 language code.
167
func (b Language) ISO3() string {
168
if b == 0 || b >= langNoIndexOffset {
169
return b.String()
170
}
171
l := lang.Elem(int(b))
172
if l[3] == 0 {
173
return l[:3]
174
} else if l[2] == 0 {
175
return altLangISO3.Elem(int(l[3]))[:3]
176
}
177
// This allocation will only happen for 3-letter ISO codes
178
// that are non-canonical BCP 47 language identifiers.
179
return l[0:1] + l[2:4]
180
}
181
182
// IsPrivateUse reports whether this language code is reserved for private use.
183
func (b Language) IsPrivateUse() bool {
184
return langPrivateStart <= b && b <= langPrivateEnd
185
}
186
187
// SuppressScript returns the script marked as SuppressScript in the IANA
188
// language tag repository, or 0 if there is no such script.
189
func (b Language) SuppressScript() Script {
190
if b < langNoIndexOffset {
191
return Script(suppressScript[b])
192
}
193
return 0
194
}
195
196
// Region is a compact numeric ID for a region subtag. The zero value renders
// as "ZZ"; other values below isoRegionOffset render as a 3-digit M49 code,
// and values at or above isoRegionOffset index regionISO (see String).
type Region uint16
197
198
// getRegionID returns the region id for s if s is a valid 2-letter region code
199
// or unknownRegion.
200
func getRegionID(s []byte) (Region, error) {
201
if len(s) == 3 {
202
if isAlpha(s[0]) {
203
return getRegionISO3(s)
204
}
205
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
206
return getRegionM49(int(i))
207
}
208
}
209
return getRegionISO2(s)
210
}
211
212
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
213
// or unknownRegion if this does not exist.
214
func getRegionISO2(s []byte) (Region, error) {
215
i, err := findIndex(regionISO, s, "ZZ")
216
if err != nil {
217
return 0, err
218
}
219
return Region(i) + isoRegionOffset, nil
220
}
221
222
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
223
// or unknownRegion if this does not exist.
224
func getRegionISO3(s []byte) (Region, error) {
225
if tag.FixCase("ZZZ", s) {
226
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
227
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
228
return Region(i) + isoRegionOffset, nil
229
}
230
}
231
for i := 0; i < len(altRegionISO3); i += 3 {
232
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
233
return Region(altRegionIDs[i/3]), nil
234
}
235
}
236
return 0, NewValueError(s)
237
}
238
return 0, ErrSyntax
239
}
240
241
func getRegionM49(n int) (Region, error) {
242
if 0 < n && n <= 999 {
243
const (
244
searchBits = 7
245
regionBits = 9
246
regionMask = 1<<regionBits - 1
247
)
248
idx := n >> searchBits
249
buf := fromM49[m49Index[idx]:m49Index[idx+1]]
250
val := uint16(n) << regionBits // we rely on bits shifting out
251
i := sort.Search(len(buf), func(i int) bool {
252
return buf[i] >= val
253
})
254
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
255
return Region(r & regionMask), nil
256
}
257
}
258
var e ValueError
259
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
260
return 0, e
261
}
262
263
// normRegion returns a region if r is deprecated or 0 otherwise.
264
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
265
// TODO: consider mapping split up regions to new most populous one (like CLDR).
266
func normRegion(r Region) Region {
267
m := regionOldMap
268
k := sort.Search(len(m), func(i int) bool {
269
return m[i].From >= uint16(r)
270
})
271
if k < len(m) && m[k].From == uint16(r) {
272
return Region(m[k].To)
273
}
274
return 0
275
}
276
277
// Bit flags describing a region. Region.typ returns the flag byte stored in
// regionTypes, and Region.IsPrivateUse tests it against iso3166UserAssigned.
const (
	// iso3166UserAssigned (1) marks regions with ISO 3166 User-assigned status.
	iso3166UserAssigned = 1 << iota
	// ccTLD (2): presumably marks codes that exist as country-code top-level
	// domains; not referenced in this file.
	ccTLD
	// bcp47Region (4): presumably marks codes valid as BCP 47 region
	// subtags; not referenced in this file.
	bcp47Region
)
282
283
// typ returns the flag byte stored for r in regionTypes; the bits are the
// iso3166UserAssigned, ccTLD and bcp47Region constants.
func (r Region) typ() byte {
	flags := regionTypes[r]
	return flags
}
286
287
// String returns the BCP 47 representation for the region.
288
// It returns "ZZ" for an unspecified region.
289
func (r Region) String() string {
290
if r < isoRegionOffset {
291
if r == 0 {
292
return "ZZ"
293
}
294
return fmt.Sprintf("%03d", r.M49())
295
}
296
r -= isoRegionOffset
297
return regionISO.Elem(int(r))[:2]
298
}
299
300
// ISO3 returns the 3-letter ISO code of r.
301
// Note that not all regions have a 3-letter ISO code.
302
// In such cases this method returns "ZZZ".
303
func (r Region) ISO3() string {
304
if r < isoRegionOffset {
305
return "ZZZ"
306
}
307
r -= isoRegionOffset
308
reg := regionISO.Elem(int(r))
309
switch reg[2] {
310
case 0:
311
return altRegionISO3[reg[3]:][:3]
312
case ' ':
313
return "ZZZ"
314
}
315
return reg[0:1] + reg[2:4]
316
}
317
318
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
319
// is not defined for r.
320
func (r Region) M49() int {
321
return int(m49[r])
322
}
323
324
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
325
// may include private-use tags that are assigned by CLDR and used in this
326
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
327
func (r Region) IsPrivateUse() bool {
328
return r.typ()&iso3166UserAssigned != 0
329
}
330
331
// Script is a compact numeric ID for a script subtag. The zero value renders
// as "Zzzz"; other values index the script table (see String).
type Script uint16
332
333
// getScriptID returns the script id for string s. It assumes that s
334
// is of the format [A-Z][a-z]{3}.
335
func getScriptID(idx tag.Index, s []byte) (Script, error) {
336
i, err := findIndex(idx, s, "Zzzz")
337
return Script(i), err
338
}
339
340
// String returns the script code in title case.
341
// It returns "Zzzz" for an unspecified script.
342
func (s Script) String() string {
343
if s == 0 {
344
return "Zzzz"
345
}
346
return script.Elem(int(s))
347
}
348
349
// IsPrivateUse reports whether this script code is reserved for private use.
350
func (s Script) IsPrivateUse() bool {
351
return _Qaaa <= s && s <= _Qabx
352
}
353
354
const (
	// maxAltTaglen is the byte length of "en-US-POSIX" (11), which is also
	// the length of the longest key in grandfatheredMap.
	maxAltTaglen = len("en-US-POSIX")

	// maxLen is the fixed size of the byte-array keys of grandfatheredMap.
	maxLen = maxAltTaglen
)
358
359
var (
360
// grandfatheredMap holds a mapping from legacy and grandfathered tags to
361
// their base language or index to more elaborate tag.
362
grandfatheredMap = map[[maxLen]byte]int16{
363
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
364
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
365
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
366
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
367
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
368
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
369
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
370
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
371
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
372
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
373
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
374
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
375
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
376
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
377
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
378
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
379
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
380
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
381
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
382
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
383
384
// Grandfathered tags with no modern replacement will be converted as
385
// follows:
386
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
387
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
388
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
389
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
390
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
391
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
392
393
// CLDR-specific tag.
394
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
395
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
396
}
397
398
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
399
400
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
401
)
402
403
// grandfathered looks s up in grandfatheredMap and reports whether it was
// found. s must be the lowercase tag, zero-padded to maxAltTaglen bytes.
//
// A non-negative map value is used directly as the LangID of an otherwise
// zero Tag. A negative value -k selects the k-th replacement tag packed into
// altTags, which is turned into a Tag with Make. When s is not in the map,
// the zero Tag and false are returned.
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
	v, found := grandfatheredMap[s]
	if !found {
		return t, false
	}
	if v >= 0 {
		t.LangID = Language(v)
		return t, true
	}
	return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
}
413
414