Path: blob/main/vendor/golang.org/x/text/language/language.go
2880 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34//go:generate go run gen.go -output tables.go56package language78// TODO: Remove above NOTE after:9// - verifying that tables are dropped correctly (most notably matcher tables).1011import (12"strings"1314"golang.org/x/text/internal/language"15"golang.org/x/text/internal/language/compact"16)1718// Tag represents a BCP 47 language tag. It is used to specify an instance of a19// specific language or locale. All language tag values are guaranteed to be20// well-formed.21type Tag compact.Tag2223func makeTag(t language.Tag) (tag Tag) {24return Tag(compact.Make(t))25}2627func (t *Tag) tag() language.Tag {28return (*compact.Tag)(t).Tag()29}3031func (t *Tag) isCompact() bool {32return (*compact.Tag)(t).IsCompact()33}3435// TODO: improve performance.36func (t *Tag) lang() language.Language { return t.tag().LangID }37func (t *Tag) region() language.Region { return t.tag().RegionID }38func (t *Tag) script() language.Script { return t.tag().ScriptID }3940// Make is a convenience wrapper for Parse that omits the error.41// In case of an error, a sensible default is returned.42func Make(s string) Tag {43return Default.Make(s)44}4546// Make is a convenience wrapper for c.Parse that omits the error.47// In case of an error, a sensible default is returned.48func (c CanonType) Make(s string) Tag {49t, _ := c.Parse(s)50return t51}5253// Raw returns the raw base language, script and region, without making an54// attempt to infer their values.55func (t Tag) Raw() (b Base, s Script, r Region) {56tt := t.tag()57return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}58}5960// IsRoot returns true if t is equal to language "und".61func (t Tag) IsRoot() bool {62return compact.Tag(t).IsRoot()63}6465// CanonType can be used to enable or disable various types of canonicalization.66type CanonType int6768const (69// Replace deprecated base languages with their preferred replacements.70DeprecatedBase CanonType = 1 << iota71// Replace deprecated scripts with their preferred replacements.72DeprecatedScript73// Replace deprecated regions with their preferred replacements.74DeprecatedRegion75// Remove redundant scripts.76SuppressScript77// Normalize legacy encodings. This includes legacy languages defined in78// CLDR as well as bibliographic codes defined in ISO-639.79Legacy80// Map the dominant language of a macro language group to the macro language81// subtag. For example cmn -> zh.82Macro83// The CLDR flag should be used if full compatibility with CLDR is required.84// There are a few cases where language.Tag may differ from CLDR. To follow all85// of CLDR's suggestions, use All|CLDR.86CLDR8788// Raw can be used to Compose or Parse without Canonicalization.89Raw CanonType = 09091// Replace all deprecated tags with their preferred replacements.92Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion9394// All canonicalizations recommended by BCP 47.95BCP47 = Deprecated | SuppressScript9697// All canonicalizations.98All = BCP47 | Legacy | Macro99100// Default is the canonicalization used by Parse, Make and Compose. To101// preserve as much information as possible, canonicalizations that remove102// potentially valuable information are not included. The Matcher is103// designed to recognize similar tags that would be the same if104// they were canonicalized using All.105Default = Deprecated | Legacy106107canonLang = DeprecatedBase | Legacy | Macro108109// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.110)111112// canonicalize returns the canonicalized equivalent of the tag and113// whether there was any change.114func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {115if c == Raw {116return t, false117}118changed := false119if c&SuppressScript != 0 {120if t.LangID.SuppressScript() == t.ScriptID {121t.ScriptID = 0122changed = true123}124}125if c&canonLang != 0 {126for {127if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {128switch aliasType {129case language.Legacy:130if c&Legacy != 0 {131if t.LangID == _sh && t.ScriptID == 0 {132t.ScriptID = _Latn133}134t.LangID = l135changed = true136}137case language.Macro:138if c&Macro != 0 {139// We deviate here from CLDR. The mapping "nb" -> "no"140// qualifies as a typical Macro language mapping. However,141// for legacy reasons, CLDR maps "no", the macro language142// code for Norwegian, to the dominant variant "nb". This143// change is currently under consideration for CLDR as well.144// See https://unicode.org/cldr/trac/ticket/2698 and also145// https://unicode.org/cldr/trac/ticket/1790 for some of the146// practical implications. TODO: this check could be removed147// if CLDR adopts this change.148if c&CLDR == 0 || t.LangID != _nb {149changed = true150t.LangID = l151}152}153case language.Deprecated:154if c&DeprecatedBase != 0 {155if t.LangID == _mo && t.RegionID == 0 {156t.RegionID = _MD157}158t.LangID = l159changed = true160// Other canonicalization types may still apply.161continue162}163}164} else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {165t.LangID = _nb166changed = true167}168break169}170}171if c&DeprecatedScript != 0 {172if t.ScriptID == _Qaai {173changed = true174t.ScriptID = _Zinh175}176}177if c&DeprecatedRegion != 0 {178if r := t.RegionID.Canonicalize(); r != t.RegionID {179changed = true180t.RegionID = r181}182}183return t, changed184}185186// Canonicalize returns the canonicalized equivalent of the tag.187func (c CanonType) Canonicalize(t Tag) (Tag, error) {188// First try fast path.189if t.isCompact() {190if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {191return t, nil192}193}194// It is unlikely that one will canonicalize a tag after matching. So do195// a slow but simple approach here.196if tag, changed := canonicalize(c, t.tag()); changed {197tag.RemakeString()198return makeTag(tag), nil199}200return t, nil201202}203204// Confidence indicates the level of certainty for a given return value.205// For example, Serbian may be written in Cyrillic or Latin script.206// The confidence level indicates whether a value was explicitly specified,207// whether it is typically the only possible value, or whether there is208// an ambiguity.209type Confidence int210211const (212No Confidence = iota // full confidence that there was no match213Low // most likely value picked out of a set of alternatives214High // value is generally assumed to be the correct match215Exact // exact match or explicitly specified value216)217218var confName = []string{"No", "Low", "High", "Exact"}219220func (c Confidence) String() string {221return confName[c]222}223224// String returns the canonical string representation of the language tag.225func (t Tag) String() string {226return t.tag().String()227}228229// MarshalText implements encoding.TextMarshaler.230func (t Tag) MarshalText() (text []byte, err error) {231return t.tag().MarshalText()232}233234// UnmarshalText implements encoding.TextUnmarshaler.235func (t *Tag) UnmarshalText(text []byte) error {236var tag language.Tag237err := tag.UnmarshalText(text)238*t = makeTag(tag)239return err240}241242// Base returns the base language of the language tag. If the base language is243// unspecified, an attempt will be made to infer it from the context.244// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.245func (t Tag) Base() (Base, Confidence) {246if b := t.lang(); b != 0 {247return Base{b}, Exact248}249tt := t.tag()250c := High251if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {252c = Low253}254if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {255return Base{tag.LangID}, c256}257return Base{0}, No258}259260// Script infers the script for the language tag. If it was not explicitly given, it will infer261// a most likely candidate.262// If more than one script is commonly used for a language, the most likely one263// is returned with a low confidence indication. For example, it returns (Cyrl, Low)264// for Serbian.265// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)266// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks267// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.268// See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for269// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.270// Note that an inferred script is never guaranteed to be the correct one. Latin is271// almost exclusively used for Afrikaans, but Arabic has been used for some texts272// in the past. Also, the script that is commonly used may change over time.273// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.274func (t Tag) Script() (Script, Confidence) {275if scr := t.script(); scr != 0 {276return Script{scr}, Exact277}278tt := t.tag()279sc, c := language.Script(_Zzzz), No280if scr := tt.LangID.SuppressScript(); scr != 0 {281// Note: it is not always the case that a language with a suppress282// script value is only written in one script (e.g. kk, ms, pa).283if tt.RegionID == 0 {284return Script{scr}, High285}286sc, c = scr, High287}288if tag, err := tt.Maximize(); err == nil {289if tag.ScriptID != sc {290sc, c = tag.ScriptID, Low291}292} else {293tt, _ = canonicalize(Deprecated|Macro, tt)294if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {295sc, c = tag.ScriptID, Low296}297}298return Script{sc}, c299}300301// Region returns the region for the language tag. If it was not explicitly given, it will302// infer a most likely candidate from the context.303// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.304func (t Tag) Region() (Region, Confidence) {305if r := t.region(); r != 0 {306return Region{r}, Exact307}308tt := t.tag()309if tt, err := tt.Maximize(); err == nil {310return Region{tt.RegionID}, Low // TODO: differentiate between high and low.311}312tt, _ = canonicalize(Deprecated|Macro, tt)313if tag, err := tt.Maximize(); err == nil {314return Region{tag.RegionID}, Low315}316return Region{_ZZ}, No // TODO: return world instead of undetermined?317}318319// Variants returns the variants specified explicitly for this language tag.320// or nil if no variant was specified.321func (t Tag) Variants() []Variant {322if !compact.Tag(t).MayHaveVariants() {323return nil324}325v := []Variant{}326x, str := "", t.tag().Variants()327for str != "" {328x, str = nextToken(str)329v = append(v, Variant{x})330}331return v332}333334// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a335// specific language are substituted with fields from the parent language.336// The parent for a language may change for newer versions of CLDR.337//338// Parent returns a tag for a less specific language that is mutually339// intelligible or Und if there is no such language. This may not be the same as340// simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"341// is "zh-Hant", and the parent of "zh-Hant" is "und".342func (t Tag) Parent() Tag {343return Tag(compact.Tag(t).Parent())344}345346// nextToken returns token t and the rest of the string.347func nextToken(s string) (t, tail string) {348p := strings.Index(s[1:], "-")349if p == -1 {350return s[1:], ""351}352p++353return s[1:p], s[p:]354}355356// Extension is a single BCP 47 extension.357type Extension struct {358s string359}360361// String returns the string representation of the extension, including the362// type tag.363func (e Extension) String() string {364return e.s365}366367// ParseExtension parses s as an extension and returns it on success.368func ParseExtension(s string) (e Extension, err error) {369ext, err := language.ParseExtension(s)370return Extension{ext}, err371}372373// Type returns the one-byte extension type of e. It returns 0 for the zero374// exception.375func (e Extension) Type() byte {376if e.s == "" {377return 0378}379return e.s[0]380}381382// Tokens returns the list of tokens of e.383func (e Extension) Tokens() []string {384return strings.Split(e.s, "-")385}386387// Extension returns the extension of type x for tag t. It will return388// false for ok if t does not have the requested extension. The returned389// extension will be invalid in this case.390func (t Tag) Extension(x byte) (ext Extension, ok bool) {391if !compact.Tag(t).MayHaveExtensions() {392return Extension{}, false393}394e, ok := t.tag().Extension(x)395return Extension{e}, ok396}397398// Extensions returns all extensions of t.399func (t Tag) Extensions() []Extension {400if !compact.Tag(t).MayHaveExtensions() {401return nil402}403e := []Extension{}404for _, ext := range t.tag().Extensions() {405e = append(e, Extension{ext})406}407return e408}409410// TypeForKey returns the type associated with the given key, where key and type411// are of the allowed values defined for the Unicode locale extension ('u') in412// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.413// TypeForKey will traverse the inheritance chain to get the correct value.414//415// If there are multiple types associated with a key, only the first will be416// returned. If there is no type associated with a key, it returns the empty417// string.418func (t Tag) TypeForKey(key string) string {419if !compact.Tag(t).MayHaveExtensions() {420if key != "rg" && key != "va" {421return ""422}423}424return t.tag().TypeForKey(key)425}426427// SetTypeForKey returns a new Tag with the key set to type, where key and type428// are of the allowed values defined for the Unicode locale extension ('u') in429// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.430// An empty value removes an existing pair with the same key.431func (t Tag) SetTypeForKey(key, value string) (Tag, error) {432tt, err := t.tag().SetTypeForKey(key, value)433return makeTag(tt), err434}435436// NumCompactTags is the number of compact tags. The maximum tag is437// NumCompactTags-1.438const NumCompactTags = compact.NumCompactTags439440// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags441// for which data exists in the text repository.The index will change over time442// and should not be stored in persistent storage. If t does not match a compact443// index, exact will be false and the compact index will be returned for the444// first match after repeatedly taking the Parent of t.445func CompactIndex(t Tag) (index int, exact bool) {446id, exact := compact.LanguageID(compact.Tag(t))447return int(id), exact448}449450var root = language.Tag{}451452// Base is an ISO 639 language code, used for encoding the base language453// of a language tag.454type Base struct {455langID language.Language456}457458// ParseBase parses a 2- or 3-letter ISO 639 code.459// It returns a ValueError if s is a well-formed but unknown language identifier460// or another error if another error occurred.461func ParseBase(s string) (Base, error) {462l, err := language.ParseBase(s)463return Base{l}, err464}465466// String returns the BCP 47 representation of the base language.467func (b Base) String() string {468return b.langID.String()469}470471// ISO3 returns the ISO 639-3 language code.472func (b Base) ISO3() string {473return b.langID.ISO3()474}475476// IsPrivateUse reports whether this language code is reserved for private use.477func (b Base) IsPrivateUse() bool {478return b.langID.IsPrivateUse()479}480481// Script is a 4-letter ISO 15924 code for representing scripts.482// It is idiomatically represented in title case.483type Script struct {484scriptID language.Script485}486487// ParseScript parses a 4-letter ISO 15924 code.488// It returns a ValueError if s is a well-formed but unknown script identifier489// or another error if another error occurred.490func ParseScript(s string) (Script, error) {491sc, err := language.ParseScript(s)492return Script{sc}, err493}494495// String returns the script code in title case.496// It returns "Zzzz" for an unspecified script.497func (s Script) String() string {498return s.scriptID.String()499}500501// IsPrivateUse reports whether this script code is reserved for private use.502func (s Script) IsPrivateUse() bool {503return s.scriptID.IsPrivateUse()504}505506// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.507type Region struct {508regionID language.Region509}510511// EncodeM49 returns the Region for the given UN M.49 code.512// It returns an error if r is not a valid code.513func EncodeM49(r int) (Region, error) {514rid, err := language.EncodeM49(r)515return Region{rid}, err516}517518// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.519// It returns a ValueError if s is a well-formed but unknown region identifier520// or another error if another error occurred.521func ParseRegion(s string) (Region, error) {522r, err := language.ParseRegion(s)523return Region{r}, err524}525526// String returns the BCP 47 representation for the region.527// It returns "ZZ" for an unspecified region.528func (r Region) String() string {529return r.regionID.String()530}531532// ISO3 returns the 3-letter ISO code of r.533// Note that not all regions have a 3-letter ISO code.534// In such cases this method returns "ZZZ".535func (r Region) ISO3() string {536return r.regionID.ISO3()537}538539// M49 returns the UN M.49 encoding of r, or 0 if this encoding540// is not defined for r.541func (r Region) M49() int {542return r.regionID.M49()543}544545// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This546// may include private-use tags that are assigned by CLDR and used in this547// implementation. So IsPrivateUse and IsCountry can be simultaneously true.548func (r Region) IsPrivateUse() bool {549return r.regionID.IsPrivateUse()550}551552// IsCountry returns whether this region is a country or autonomous area. This553// includes non-standard definitions from CLDR.554func (r Region) IsCountry() bool {555return r.regionID.IsCountry()556}557558// IsGroup returns whether this region defines a collection of regions. This559// includes non-standard definitions from CLDR.560func (r Region) IsGroup() bool {561return r.regionID.IsGroup()562}563564// Contains returns whether Region c is contained by Region r. It returns true565// if c == r.566func (r Region) Contains(c Region) bool {567return r.regionID.Contains(c.regionID)568}569570// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.571// In all other cases it returns either the region itself or an error.572//573// This method may return an error for a region for which there exists a574// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The575// region will already be canonicalized it was obtained from a Tag that was576// obtained using any of the default methods.577func (r Region) TLD() (Region, error) {578tld, err := r.regionID.TLD()579return Region{tld}, err580}581582// Canonicalize returns the region or a possible replacement if the region is583// deprecated. It will not return a replacement for deprecated regions that584// are split into multiple regions.585func (r Region) Canonicalize() Region {586return Region{r.regionID.Canonicalize()}587}588589// Variant represents a registered variant of a language as defined by BCP 47.590type Variant struct {591variant string592}593594// ParseVariant parses and returns a Variant. An error is returned if s is not595// a valid variant.596func ParseVariant(s string) (Variant, error) {597v, err := language.ParseVariant(s)598return Variant{v.String()}, err599}600601// String returns the string representation of the variant.602func (v Variant) String() string {603return v.variant604}605606607