Path: blob/main/vendor/golang.org/x/text/internal/language/language.go
2893 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34//go:generate go run gen.go gen_common.go -output tables.go56package language // import "golang.org/x/text/internal/language"78// TODO: Remove above NOTE after:9// - verifying that tables are dropped correctly (most notably matcher tables).1011import (12"errors"13"fmt"14"strings"15)1617const (18// maxCoreSize is the maximum size of a BCP 47 tag without variants and19// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.20maxCoreSize = 122122// max99thPercentileSize is a somewhat arbitrary buffer size that presumably23// is large enough to hold at least 99% of the BCP 47 tags.24max99thPercentileSize = 322526// maxSimpleUExtensionSize is the maximum size of a -u extension with one27// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).28maxSimpleUExtensionSize = 1429)3031// Tag represents a BCP 47 language tag. It is used to specify an instance of a32// specific language or locale. All language tag values are guaranteed to be33// well-formed. The zero value of Tag is Und.34type Tag struct {35// TODO: the following fields have the form TagTypeID. This name is chosen36// to allow refactoring the public package without conflicting with its37// Base, Script, and Region methods. Once the transition is fully completed38// the ID can be stripped from the name.3940LangID Language41RegionID Region42// TODO: we will soon run out of positions for ScriptID. Idea: instead of43// storing lang, region, and ScriptID codes, store only the compact index and44// have a lookup table from this code to its expansion. This greatly speeds45// up table lookup, speed up common variant cases.46// This will also immediately free up 3 extra bytes. Also, the pVariant47// field can now be moved to the lookup table, as the compact index uniquely48// determines the offset of a possible variant.49ScriptID Script50pVariant byte // offset in str, includes preceding '-'51pExt uint16 // offset of first extension, includes preceding '-'5253// str is the string representation of the Tag. It will only be used if the54// tag has variants or extensions.55str string56}5758// Make is a convenience wrapper for Parse that omits the error.59// In case of an error, a sensible default is returned.60func Make(s string) Tag {61t, _ := Parse(s)62return t63}6465// Raw returns the raw base language, script and region, without making an66// attempt to infer their values.67// TODO: consider removing68func (t Tag) Raw() (b Language, s Script, r Region) {69return t.LangID, t.ScriptID, t.RegionID70}7172// equalTags compares language, script and region subtags only.73func (t Tag) equalTags(a Tag) bool {74return t.LangID == a.LangID && t.ScriptID == a.ScriptID && t.RegionID == a.RegionID75}7677// IsRoot returns true if t is equal to language "und".78func (t Tag) IsRoot() bool {79if int(t.pVariant) < len(t.str) {80return false81}82return t.equalTags(Und)83}8485// IsPrivateUse reports whether the Tag consists solely of an IsPrivateUse use86// tag.87func (t Tag) IsPrivateUse() bool {88return t.str != "" && t.pVariant == 089}9091// RemakeString is used to update t.str in case lang, script or region changed.92// It is assumed that pExt and pVariant still point to the start of the93// respective parts.94func (t *Tag) RemakeString() {95if t.str == "" {96return97}98extra := t.str[t.pVariant:]99if t.pVariant > 0 {100extra = extra[1:]101}102if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {103t.str = extra104t.pVariant = 0105t.pExt = 0106return107}108var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.109b := buf[:t.genCoreBytes(buf[:])]110if extra != "" {111diff := len(b) - int(t.pVariant)112b = append(b, '-')113b = append(b, extra...)114t.pVariant = uint8(int(t.pVariant) + diff)115t.pExt = uint16(int(t.pExt) + diff)116} else {117t.pVariant = uint8(len(b))118t.pExt = uint16(len(b))119}120t.str = string(b)121}122123// genCoreBytes writes a string for the base languages, script and region tags124// to the given buffer and returns the number of bytes written. It will never125// write more than maxCoreSize bytes.126func (t *Tag) genCoreBytes(buf []byte) int {127n := t.LangID.StringToBuf(buf[:])128if t.ScriptID != 0 {129n += copy(buf[n:], "-")130n += copy(buf[n:], t.ScriptID.String())131}132if t.RegionID != 0 {133n += copy(buf[n:], "-")134n += copy(buf[n:], t.RegionID.String())135}136return n137}138139// String returns the canonical string representation of the language tag.140func (t Tag) String() string {141if t.str != "" {142return t.str143}144if t.ScriptID == 0 && t.RegionID == 0 {145return t.LangID.String()146}147buf := [maxCoreSize]byte{}148return string(buf[:t.genCoreBytes(buf[:])])149}150151// MarshalText implements encoding.TextMarshaler.152func (t Tag) MarshalText() (text []byte, err error) {153if t.str != "" {154text = append(text, t.str...)155} else if t.ScriptID == 0 && t.RegionID == 0 {156text = append(text, t.LangID.String()...)157} else {158buf := [maxCoreSize]byte{}159text = buf[:t.genCoreBytes(buf[:])]160}161return text, nil162}163164// UnmarshalText implements encoding.TextUnmarshaler.165func (t *Tag) UnmarshalText(text []byte) error {166tag, err := Parse(string(text))167*t = tag168return err169}170171// Variants returns the part of the tag holding all variants or the empty string172// if there are no variants defined.173func (t Tag) Variants() string {174if t.pVariant == 0 {175return ""176}177return t.str[t.pVariant:t.pExt]178}179180// VariantOrPrivateUseTags returns variants or private use tags.181func (t Tag) VariantOrPrivateUseTags() string {182if t.pExt > 0 {183return t.str[t.pVariant:t.pExt]184}185return t.str[t.pVariant:]186}187188// HasString reports whether this tag defines more than just the raw189// components.190func (t Tag) HasString() bool {191return t.str != ""192}193194// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a195// specific language are substituted with fields from the parent language.196// The parent for a language may change for newer versions of CLDR.197func (t Tag) Parent() Tag {198if t.str != "" {199// Strip the variants and extensions.200b, s, r := t.Raw()201t = Tag{LangID: b, ScriptID: s, RegionID: r}202if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {203base, _ := addTags(Tag{LangID: t.LangID})204if base.ScriptID == t.ScriptID {205return Tag{LangID: t.LangID}206}207}208return t209}210if t.LangID != 0 {211if t.RegionID != 0 {212maxScript := t.ScriptID213if maxScript == 0 {214max, _ := addTags(t)215maxScript = max.ScriptID216}217218for i := range parents {219if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {220for _, r := range parents[i].fromRegion {221if Region(r) == t.RegionID {222return Tag{223LangID: t.LangID,224ScriptID: Script(parents[i].script),225RegionID: Region(parents[i].toRegion),226}227}228}229}230}231232// Strip the script if it is the default one.233base, _ := addTags(Tag{LangID: t.LangID})234if base.ScriptID != maxScript {235return Tag{LangID: t.LangID, ScriptID: maxScript}236}237return Tag{LangID: t.LangID}238} else if t.ScriptID != 0 {239// The parent for an base-script pair with a non-default script is240// "und" instead of the base language.241base, _ := addTags(Tag{LangID: t.LangID})242if base.ScriptID != t.ScriptID {243return Und244}245return Tag{LangID: t.LangID}246}247}248return Und249}250251// ParseExtension parses s as an extension and returns it on success.252func ParseExtension(s string) (ext string, err error) {253defer func() {254if recover() != nil {255ext = ""256err = ErrSyntax257}258}()259260scan := makeScannerString(s)261var end int262if n := len(scan.token); n != 1 {263return "", ErrSyntax264}265scan.toLower(0, len(scan.b))266end = parseExtension(&scan)267if end != len(s) {268return "", ErrSyntax269}270return string(scan.b), nil271}272273// HasVariants reports whether t has variants.274func (t Tag) HasVariants() bool {275return uint16(t.pVariant) < t.pExt276}277278// HasExtensions reports whether t has extensions.279func (t Tag) HasExtensions() bool {280return int(t.pExt) < len(t.str)281}282283// Extension returns the extension of type x for tag t. It will return284// false for ok if t does not have the requested extension. The returned285// extension will be invalid in this case.286func (t Tag) Extension(x byte) (ext string, ok bool) {287for i := int(t.pExt); i < len(t.str)-1; {288var ext string289i, ext = getExtension(t.str, i)290if ext[0] == x {291return ext, true292}293}294return "", false295}296297// Extensions returns all extensions of t.298func (t Tag) Extensions() []string {299e := []string{}300for i := int(t.pExt); i < len(t.str)-1; {301var ext string302i, ext = getExtension(t.str, i)303e = append(e, ext)304}305return e306}307308// TypeForKey returns the type associated with the given key, where key and type309// are of the allowed values defined for the Unicode locale extension ('u') in310// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.311// TypeForKey will traverse the inheritance chain to get the correct value.312//313// If there are multiple types associated with a key, only the first will be314// returned. If there is no type associated with a key, it returns the empty315// string.316func (t Tag) TypeForKey(key string) string {317if _, start, end, _ := t.findTypeForKey(key); end != start {318s := t.str[start:end]319if p := strings.IndexByte(s, '-'); p >= 0 {320s = s[:p]321}322return s323}324return ""325}326327var (328errPrivateUse = errors.New("cannot set a key on a private use tag")329errInvalidArguments = errors.New("invalid key or type")330)331332// SetTypeForKey returns a new Tag with the key set to type, where key and type333// are of the allowed values defined for the Unicode locale extension ('u') in334// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.335// An empty value removes an existing pair with the same key.336func (t Tag) SetTypeForKey(key, value string) (Tag, error) {337if t.IsPrivateUse() {338return t, errPrivateUse339}340if len(key) != 2 {341return t, errInvalidArguments342}343344// Remove the setting if value is "".345if value == "" {346start, sep, end, _ := t.findTypeForKey(key)347if start != sep {348// Remove a possible empty extension.349switch {350case t.str[start-2] != '-': // has previous elements.351case end == len(t.str), // end of string352end+2 < len(t.str) && t.str[end+2] == '-': // end of extension353start -= 2354}355if start == int(t.pVariant) && end == len(t.str) {356t.str = ""357t.pVariant, t.pExt = 0, 0358} else {359t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])360}361}362return t, nil363}364365if len(value) < 3 || len(value) > 8 {366return t, errInvalidArguments367}368369var (370buf [maxCoreSize + maxSimpleUExtensionSize]byte371uStart int // start of the -u extension.372)373374// Generate the tag string if needed.375if t.str == "" {376uStart = t.genCoreBytes(buf[:])377buf[uStart] = '-'378uStart++379}380381// Create new key-type pair and parse it to verify.382b := buf[uStart:]383copy(b, "u-")384copy(b[2:], key)385b[4] = '-'386b = b[:5+copy(b[5:], value)]387scan := makeScanner(b)388if parseExtensions(&scan); scan.err != nil {389return t, scan.err390}391392// Assemble the replacement string.393if t.str == "" {394t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)395t.str = string(buf[:uStart+len(b)])396} else {397s := t.str398start, sep, end, hasExt := t.findTypeForKey(key)399if start == sep {400if hasExt {401b = b[2:]402}403t.str = fmt.Sprintf("%s-%s%s", s[:sep], b, s[end:])404} else {405t.str = fmt.Sprintf("%s-%s%s", s[:start+3], value, s[end:])406}407}408return t, nil409}410411// findTypeForKey returns the start and end position for the type corresponding412// to key or the point at which to insert the key-value pair if the type413// wasn't found. The hasExt return value reports whether an -u extension was present.414// Note: the extensions are typically very small and are likely to contain415// only one key-type pair.416func (t Tag) findTypeForKey(key string) (start, sep, end int, hasExt bool) {417p := int(t.pExt)418if len(key) != 2 || p == len(t.str) || p == 0 {419return p, p, p, false420}421s := t.str422423// Find the correct extension.424for p++; s[p] != 'u'; p++ {425if s[p] > 'u' {426p--427return p, p, p, false428}429if p = nextExtension(s, p); p == len(s) {430return len(s), len(s), len(s), false431}432}433// Proceed to the hyphen following the extension name.434p++435436// curKey is the key currently being processed.437curKey := ""438439// Iterate over keys until we get the end of a section.440for {441end = p442for p++; p < len(s) && s[p] != '-'; p++ {443}444n := p - end - 1445if n <= 2 && curKey == key {446if sep < end {447sep++448}449return start, sep, end, true450}451switch n {452case 0, // invalid string4531: // next extension454return end, end, end, true455case 2:456// next key457curKey = s[end+1 : p]458if curKey > key {459return end, end, end, true460}461start = end462sep = p463}464}465}466467// ParseBase parses a 2- or 3-letter ISO 639 code.468// It returns a ValueError if s is a well-formed but unknown language identifier469// or another error if another error occurred.470func ParseBase(s string) (l Language, err error) {471defer func() {472if recover() != nil {473l = 0474err = ErrSyntax475}476}()477478if n := len(s); n < 2 || 3 < n {479return 0, ErrSyntax480}481var buf [3]byte482return getLangID(buf[:copy(buf[:], s)])483}484485// ParseScript parses a 4-letter ISO 15924 code.486// It returns a ValueError if s is a well-formed but unknown script identifier487// or another error if another error occurred.488func ParseScript(s string) (scr Script, err error) {489defer func() {490if recover() != nil {491scr = 0492err = ErrSyntax493}494}()495496if len(s) != 4 {497return 0, ErrSyntax498}499var buf [4]byte500return getScriptID(script, buf[:copy(buf[:], s)])501}502503// EncodeM49 returns the Region for the given UN M.49 code.504// It returns an error if r is not a valid code.505func EncodeM49(r int) (Region, error) {506return getRegionM49(r)507}508509// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.510// It returns a ValueError if s is a well-formed but unknown region identifier511// or another error if another error occurred.512func ParseRegion(s string) (r Region, err error) {513defer func() {514if recover() != nil {515r = 0516err = ErrSyntax517}518}()519520if n := len(s); n < 2 || 3 < n {521return 0, ErrSyntax522}523var buf [3]byte524return getRegionID(buf[:copy(buf[:], s)])525}526527// IsCountry returns whether this region is a country or autonomous area. This528// includes non-standard definitions from CLDR.529func (r Region) IsCountry() bool {530if r == 0 || r.IsGroup() || r.IsPrivateUse() && r != _XK {531return false532}533return true534}535536// IsGroup returns whether this region defines a collection of regions. This537// includes non-standard definitions from CLDR.538func (r Region) IsGroup() bool {539if r == 0 {540return false541}542return int(regionInclusion[r]) < len(regionContainment)543}544545// Contains returns whether Region c is contained by Region r. It returns true546// if c == r.547func (r Region) Contains(c Region) bool {548if r == c {549return true550}551g := regionInclusion[r]552if g >= nRegionGroups {553return false554}555m := regionContainment[g]556557d := regionInclusion[c]558b := regionInclusionBits[d]559560// A contained country may belong to multiple disjoint groups. Matching any561// of these indicates containment. If the contained region is a group, it562// must strictly be a subset.563if d >= nRegionGroups {564return b&m != 0565}566return b&^m == 0567}568569var errNoTLD = errors.New("language: region is not a valid ccTLD")570571// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.572// In all other cases it returns either the region itself or an error.573//574// This method may return an error for a region for which there exists a575// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The576// region will already be canonicalized it was obtained from a Tag that was577// obtained using any of the default methods.578func (r Region) TLD() (Region, error) {579// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the580// difference between ISO 3166-1 and IANA ccTLD.581if r == _GB {582r = _UK583}584if (r.typ() & ccTLD) == 0 {585return 0, errNoTLD586}587return r, nil588}589590// Canonicalize returns the region or a possible replacement if the region is591// deprecated. It will not return a replacement for deprecated regions that592// are split into multiple regions.593func (r Region) Canonicalize() Region {594if cr := normRegion(r); cr != 0 {595return cr596}597return r598}599600// Variant represents a registered variant of a language as defined by BCP 47.601type Variant struct {602ID uint8603str string604}605606// ParseVariant parses and returns a Variant. An error is returned if s is not607// a valid variant.608func ParseVariant(s string) (v Variant, err error) {609defer func() {610if recover() != nil {611v = Variant{}612err = ErrSyntax613}614}()615616s = strings.ToLower(s)617if id, ok := variantIndex[s]; ok {618return Variant{id, s}, nil619}620return Variant{}, NewValueError([]byte(s))621}622623// String returns the string representation of the variant.624func (v Variant) String() string {625return v.str626}627628629