Path: blob/main/vendor/golang.org/x/text/internal/language/lookup.go
2893 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34package language56import (7"bytes"8"fmt"9"sort"10"strconv"1112"golang.org/x/text/internal/tag"13)1415// findIndex tries to find the given tag in idx and returns a standardized error16// if it could not be found.17func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {18if !tag.FixCase(form, key) {19return 0, ErrSyntax20}21i := idx.Index(key)22if i == -1 {23return 0, NewValueError(key)24}25return i, nil26}2728func searchUint(imap []uint16, key uint16) int {29return sort.Search(len(imap), func(i int) bool {30return imap[i] >= key31})32}3334type Language uint163536// getLangID returns the langID of s if s is a canonical subtag37// or langUnknown if s is not a canonical subtag.38func getLangID(s []byte) (Language, error) {39if len(s) == 2 {40return getLangISO2(s)41}42return getLangISO3(s)43}4445// TODO language normalization as well as the AliasMaps could be moved to the46// higher level package, but it is a bit tricky to separate the generation.4748func (id Language) Canonicalize() (Language, AliasType) {49return normLang(id)50}5152// normLang returns the mapped langID of id according to mapping m.53func normLang(id Language) (Language, AliasType) {54k := sort.Search(len(AliasMap), func(i int) bool {55return AliasMap[i].From >= uint16(id)56})57if k < len(AliasMap) && AliasMap[k].From == uint16(id) {58return Language(AliasMap[k].To), AliasTypes[k]59}60return id, AliasTypeUnknown61}6263// getLangISO2 returns the langID for the given 2-letter ISO language code64// or unknownLang if this does not exist.65func getLangISO2(s []byte) (Language, error) {66if !tag.FixCase("zz", s) {67return 0, ErrSyntax68}69if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {70return Language(i), nil71}72return 0, NewValueError(s)73}7475const base = 'z' - 'a' + 17677func strToInt(s []byte) uint {78v := uint(0)79for i := 0; i < len(s); i++ {80v *= base81v += uint(s[i] - 'a')82}83return v84}8586// converts the given integer to the original ASCII string passed to strToInt.87// len(s) must match the number of characters obtained.88func intToStr(v uint, s []byte) {89for i := len(s) - 1; i >= 0; i-- {90s[i] = byte(v%base) + 'a'91v /= base92}93}9495// getLangISO3 returns the langID for the given 3-letter ISO language code96// or unknownLang if this does not exist.97func getLangISO3(s []byte) (Language, error) {98if tag.FixCase("und", s) {99// first try to match canonical 3-letter entries100for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {101if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {102// We treat "und" as special and always translate it to "unspecified".103// Note that ZZ and Zzzz are private use and are not treated as104// unspecified by default.105id := Language(i)106if id == nonCanonicalUnd {107return 0, nil108}109return id, nil110}111}112if i := altLangISO3.Index(s); i != -1 {113return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil114}115n := strToInt(s)116if langNoIndex[n/8]&(1<<(n%8)) != 0 {117return Language(n) + langNoIndexOffset, nil118}119// Check for non-canonical uses of ISO3.120for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {121if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {122return Language(i), nil123}124}125return 0, NewValueError(s)126}127return 0, ErrSyntax128}129130// StringToBuf writes the string to b and returns the number of bytes131// written. cap(b) must be >= 3.132func (id Language) StringToBuf(b []byte) int {133if id >= langNoIndexOffset {134intToStr(uint(id)-langNoIndexOffset, b[:3])135return 3136} else if id == 0 {137return copy(b, "und")138}139l := lang[id<<2:]140if l[3] == 0 {141return copy(b, l[:3])142}143return copy(b, l[:2])144}145146// String returns the BCP 47 representation of the langID.147// Use b as variable name, instead of id, to ensure the variable148// used is consistent with that of Base in which this type is embedded.149func (b Language) String() string {150if b == 0 {151return "und"152} else if b >= langNoIndexOffset {153b -= langNoIndexOffset154buf := [3]byte{}155intToStr(uint(b), buf[:])156return string(buf[:])157}158l := lang.Elem(int(b))159if l[3] == 0 {160return l[:3]161}162return l[:2]163}164165// ISO3 returns the ISO 639-3 language code.166func (b Language) ISO3() string {167if b == 0 || b >= langNoIndexOffset {168return b.String()169}170l := lang.Elem(int(b))171if l[3] == 0 {172return l[:3]173} else if l[2] == 0 {174return altLangISO3.Elem(int(l[3]))[:3]175}176// This allocation will only happen for 3-letter ISO codes177// that are non-canonical BCP 47 language identifiers.178return l[0:1] + l[2:4]179}180181// IsPrivateUse reports whether this language code is reserved for private use.182func (b Language) IsPrivateUse() bool {183return langPrivateStart <= b && b <= langPrivateEnd184}185186// SuppressScript returns the script marked as SuppressScript in the IANA187// language tag repository, or 0 if there is no such script.188func (b Language) SuppressScript() Script {189if b < langNoIndexOffset {190return Script(suppressScript[b])191}192return 0193}194195type Region uint16196197// getRegionID returns the region id for s if s is a valid 2-letter region code198// or unknownRegion.199func getRegionID(s []byte) (Region, error) {200if len(s) == 3 {201if isAlpha(s[0]) {202return getRegionISO3(s)203}204if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {205return getRegionM49(int(i))206}207}208return getRegionISO2(s)209}210211// getRegionISO2 returns the regionID for the given 2-letter ISO country code212// or unknownRegion if this does not exist.213func getRegionISO2(s []byte) (Region, error) {214i, err := findIndex(regionISO, s, "ZZ")215if err != nil {216return 0, err217}218return Region(i) + isoRegionOffset, nil219}220221// getRegionISO3 returns the regionID for the given 3-letter ISO country code222// or unknownRegion if this does not exist.223func getRegionISO3(s []byte) (Region, error) {224if tag.FixCase("ZZZ", s) {225for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {226if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {227return Region(i) + isoRegionOffset, nil228}229}230for i := 0; i < len(altRegionISO3); i += 3 {231if tag.Compare(altRegionISO3[i:i+3], s) == 0 {232return Region(altRegionIDs[i/3]), nil233}234}235return 0, NewValueError(s)236}237return 0, ErrSyntax238}239240func getRegionM49(n int) (Region, error) {241if 0 < n && n <= 999 {242const (243searchBits = 7244regionBits = 9245regionMask = 1<<regionBits - 1246)247idx := n >> searchBits248buf := fromM49[m49Index[idx]:m49Index[idx+1]]249val := uint16(n) << regionBits // we rely on bits shifting out250i := sort.Search(len(buf), func(i int) bool {251return buf[i] >= val252})253if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {254return Region(r & regionMask), nil255}256}257var e ValueError258fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)259return 0, e260}261262// normRegion returns a region if r is deprecated or 0 otherwise.263// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).264// TODO: consider mapping split up regions to new most populous one (like CLDR).265func normRegion(r Region) Region {266m := regionOldMap267k := sort.Search(len(m), func(i int) bool {268return m[i].From >= uint16(r)269})270if k < len(m) && m[k].From == uint16(r) {271return Region(m[k].To)272}273return 0274}275276const (277iso3166UserAssigned = 1 << iota278ccTLD279bcp47Region280)281282func (r Region) typ() byte {283return regionTypes[r]284}285286// String returns the BCP 47 representation for the region.287// It returns "ZZ" for an unspecified region.288func (r Region) String() string {289if r < isoRegionOffset {290if r == 0 {291return "ZZ"292}293return fmt.Sprintf("%03d", r.M49())294}295r -= isoRegionOffset296return regionISO.Elem(int(r))[:2]297}298299// ISO3 returns the 3-letter ISO code of r.300// Note that not all regions have a 3-letter ISO code.301// In such cases this method returns "ZZZ".302func (r Region) ISO3() string {303if r < isoRegionOffset {304return "ZZZ"305}306r -= isoRegionOffset307reg := regionISO.Elem(int(r))308switch reg[2] {309case 0:310return altRegionISO3[reg[3]:][:3]311case ' ':312return "ZZZ"313}314return reg[0:1] + reg[2:4]315}316317// M49 returns the UN M.49 encoding of r, or 0 if this encoding318// is not defined for r.319func (r Region) M49() int {320return int(m49[r])321}322323// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This324// may include private-use tags that are assigned by CLDR and used in this325// implementation. So IsPrivateUse and IsCountry can be simultaneously true.326func (r Region) IsPrivateUse() bool {327return r.typ()&iso3166UserAssigned != 0328}329330type Script uint16331332// getScriptID returns the script id for string s. It assumes that s333// is of the format [A-Z][a-z]{3}.334func getScriptID(idx tag.Index, s []byte) (Script, error) {335i, err := findIndex(idx, s, "Zzzz")336return Script(i), err337}338339// String returns the script code in title case.340// It returns "Zzzz" for an unspecified script.341func (s Script) String() string {342if s == 0 {343return "Zzzz"344}345return script.Elem(int(s))346}347348// IsPrivateUse reports whether this script code is reserved for private use.349func (s Script) IsPrivateUse() bool {350return _Qaaa <= s && s <= _Qabx351}352353const (354maxAltTaglen = len("en-US-POSIX")355maxLen = maxAltTaglen356)357358var (359// grandfatheredMap holds a mapping from legacy and grandfathered tags to360// their base language or index to more elaborate tag.361grandfatheredMap = map[[maxLen]byte]int16{362[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban363[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami364[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn365[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak366[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon367[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux368[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo369[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn370[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao371[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay372[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu373[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok374[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn375[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR376[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL377[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE378[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu379[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka380[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan381[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang382383// Grandfathered tags with no modern replacement will be converted as384// follows:385[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish386[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed387[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default388[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian389[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo390[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min391392// CLDR-specific tag.393[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root394[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"395}396397altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}398399altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"400)401402func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {403if v, ok := grandfatheredMap[s]; ok {404if v < 0 {405return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true406}407t.LangID = Language(v)408return t, true409}410return t, false411}412413414