// Path: blob/main/vendor/golang.org/x/text/internal/language/parse.go
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34package language56import (7"bytes"8"errors"9"fmt"10"sort"1112"golang.org/x/text/internal/tag"13)1415// isAlpha returns true if the byte is not a digit.16// b must be an ASCII letter or digit.17func isAlpha(b byte) bool {18return b > '9'19}2021// isAlphaNum returns true if the string contains only ASCII letters or digits.22func isAlphaNum(s []byte) bool {23for _, c := range s {24if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {25return false26}27}28return true29}3031// ErrSyntax is returned by any of the parsing functions when the32// input is not well-formed, according to BCP 47.33// TODO: return the position at which the syntax error occurred?34var ErrSyntax = errors.New("language: tag is not well-formed")3536// ErrDuplicateKey is returned when a tag contains the same key twice with37// different values in the -u section.38var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")3940// ValueError is returned by any of the parsing functions when the41// input is well-formed but the respective subtag is not recognized42// as a valid value.43type ValueError struct {44v [8]byte45}4647// NewValueError creates a new ValueError.48func NewValueError(tag []byte) ValueError {49var e ValueError50copy(e.v[:], tag)51return e52}5354func (e ValueError) tag() []byte {55n := bytes.IndexByte(e.v[:], 0)56if n == -1 {57n = 858}59return e.v[:n]60}6162// Error implements the error interface.63func (e ValueError) Error() string {64return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())65}6667// Subtag returns the subtag for which the error occurred.68func (e ValueError) Subtag() string {69return string(e.tag())70}7172// scanner is used to scan BCP 47 tokens, which are separated by _ or -.73type scanner struct {74b []byte75bytes 
[max99thPercentileSize]byte76token []byte77start int // start position of the current token78end int // end position of the current token79next int // next point for scan80err error81done bool82}8384func makeScannerString(s string) scanner {85scan := scanner{}86if len(s) <= len(scan.bytes) {87scan.b = scan.bytes[:copy(scan.bytes[:], s)]88} else {89scan.b = []byte(s)90}91scan.init()92return scan93}9495// makeScanner returns a scanner using b as the input buffer.96// b is not copied and may be modified by the scanner routines.97func makeScanner(b []byte) scanner {98scan := scanner{b: b}99scan.init()100return scan101}102103func (s *scanner) init() {104for i, c := range s.b {105if c == '_' {106s.b[i] = '-'107}108}109s.scan()110}111112// restToLower converts the string between start and end to lower case.113func (s *scanner) toLower(start, end int) {114for i := start; i < end; i++ {115c := s.b[i]116if 'A' <= c && c <= 'Z' {117s.b[i] += 'a' - 'A'118}119}120}121122func (s *scanner) setError(e error) {123if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {124s.err = e125}126}127128// resizeRange shrinks or grows the array at position oldStart such that129// a new string of size newSize can fit between oldStart and oldEnd.130// Sets the scan point to after the resized range.131func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {132s.start = oldStart133if end := oldStart + newSize; end != oldEnd {134diff := end - oldEnd135var b []byte136if n := len(s.b) + diff; n > cap(s.b) {137b = make([]byte, n)138copy(b, s.b[:oldStart])139} else {140b = s.b[:n]141}142copy(b[end:], s.b[oldEnd:])143s.b = b144s.next = end + (s.next - s.end)145s.end = end146}147}148149// replace replaces the current token with repl.150func (s *scanner) replace(repl string) {151s.resizeRange(s.start, s.end, len(repl))152copy(s.b[s.start:], repl)153}154155// gobble removes the current token from the input.156// Caller must call scan after calling gobble.157func (s *scanner) gobble(e error) 
{158s.setError(e)159if s.start == 0 {160s.b = s.b[:+copy(s.b, s.b[s.next:])]161s.end = 0162} else {163s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]164s.end = s.start - 1165}166s.next = s.start167}168169// deleteRange removes the given range from s.b before the current token.170func (s *scanner) deleteRange(start, end int) {171s.b = s.b[:start+copy(s.b[start:], s.b[end:])]172diff := end - start173s.next -= diff174s.start -= diff175s.end -= diff176}177178// scan parses the next token of a BCP 47 string. Tokens that are larger179// than 8 characters or include non-alphanumeric characters result in an error180// and are gobbled and removed from the output.181// It returns the end position of the last token consumed.182func (s *scanner) scan() (end int) {183end = s.end184s.token = nil185for s.start = s.next; s.next < len(s.b); {186i := bytes.IndexByte(s.b[s.next:], '-')187if i == -1 {188s.end = len(s.b)189s.next = len(s.b)190i = s.end - s.start191} else {192s.end = s.next + i193s.next = s.end + 1194}195token := s.b[s.start:s.end]196if i < 1 || i > 8 || !isAlphaNum(token) {197s.gobble(ErrSyntax)198continue199}200s.token = token201return end202}203if n := len(s.b); n > 0 && s.b[n-1] == '-' {204s.setError(ErrSyntax)205s.b = s.b[:len(s.b)-1]206}207s.done = true208return end209}210211// acceptMinSize parses multiple tokens of the given size or greater.212// It returns the end position of the last token consumed.213func (s *scanner) acceptMinSize(min int) (end int) {214end = s.end215s.scan()216for ; len(s.token) >= min; s.scan() {217end = s.end218}219return end220}221222// Parse parses the given BCP 47 string and returns a valid Tag. If parsing223// failed it returns an error and any part of the tag that could be parsed.224// If parsing succeeded but an unknown value was found, it returns225// ValueError. The Tag returned in this case is just stripped of the unknown226// value. All other values are preserved. 
It accepts tags in the BCP 47 format227// and extensions to this standard defined in228// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.229func Parse(s string) (t Tag, err error) {230// TODO: consider supporting old-style locale key-value pairs.231if s == "" {232return Und, ErrSyntax233}234defer func() {235if recover() != nil {236t = Und237err = ErrSyntax238return239}240}()241if len(s) <= maxAltTaglen {242b := [maxAltTaglen]byte{}243for i, c := range s {244// Generating invalid UTF-8 is okay as it won't match.245if 'A' <= c && c <= 'Z' {246c += 'a' - 'A'247} else if c == '_' {248c = '-'249}250b[i] = byte(c)251}252if t, ok := grandfathered(b); ok {253return t, nil254}255}256scan := makeScannerString(s)257return parse(&scan, s)258}259260func parse(scan *scanner, s string) (t Tag, err error) {261t = Und262var end int263if n := len(scan.token); n <= 1 {264scan.toLower(0, len(scan.b))265if n == 0 || scan.token[0] != 'x' {266return t, ErrSyntax267}268end = parseExtensions(scan)269} else if n >= 4 {270return Und, ErrSyntax271} else { // the usual case272t, end = parseTag(scan, true)273if n := len(scan.token); n == 1 {274t.pExt = uint16(end)275end = parseExtensions(scan)276} else if end < len(scan.b) {277scan.setError(ErrSyntax)278scan.b = scan.b[:end]279}280}281if int(t.pVariant) < len(scan.b) {282if end < len(s) {283s = s[:end]284}285if len(s) > 0 && tag.Compare(s, scan.b) == 0 {286t.str = s287} else {288t.str = string(scan.b)289}290} else {291t.pVariant, t.pExt = 0, 0292}293return t, scan.err294}295296// parseTag parses language, script, region and variants.297// It returns a Tag and the end position in the input that was parsed.298// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.299func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {300var e error301// TODO: set an error if an unknown lang, script or region is encountered.302t.LangID, e = 
getLangID(scan.token)303scan.setError(e)304scan.replace(t.LangID.String())305langStart := scan.start306end = scan.scan()307for len(scan.token) == 3 && isAlpha(scan.token[0]) {308// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent309// to a tag of the form <extlang>.310if doNorm {311lang, e := getLangID(scan.token)312if lang != 0 {313t.LangID = lang314langStr := lang.String()315copy(scan.b[langStart:], langStr)316scan.b[langStart+len(langStr)] = '-'317scan.start = langStart + len(langStr) + 1318}319scan.gobble(e)320}321end = scan.scan()322}323if len(scan.token) == 4 && isAlpha(scan.token[0]) {324t.ScriptID, e = getScriptID(script, scan.token)325if t.ScriptID == 0 {326scan.gobble(e)327}328end = scan.scan()329}330if n := len(scan.token); n >= 2 && n <= 3 {331t.RegionID, e = getRegionID(scan.token)332if t.RegionID == 0 {333scan.gobble(e)334} else {335scan.replace(t.RegionID.String())336}337end = scan.scan()338}339scan.toLower(scan.start, len(scan.b))340t.pVariant = byte(end)341end = parseVariants(scan, end, t)342t.pExt = uint16(end)343return t, end344}345346var separator = []byte{'-'}347348// parseVariants scans tokens as long as each token is a valid variant string.349// Duplicate variants are removed.350func parseVariants(scan *scanner, end int, t Tag) int {351start := scan.start352varIDBuf := [4]uint8{}353variantBuf := [4][]byte{}354varID := varIDBuf[:0]355variant := variantBuf[:0]356last := -1357needSort := false358for ; len(scan.token) >= 4; scan.scan() {359// TODO: measure the impact of needing this conversion and redesign360// the data structure if there is an issue.361v, ok := variantIndex[string(scan.token)]362if !ok {363// unknown variant364// TODO: allow user-defined variants?365scan.gobble(NewValueError(scan.token))366continue367}368varID = append(varID, v)369variant = append(variant, scan.token)370if !needSort {371if last < int(v) {372last = int(v)373} else {374needSort = true375// There is no legal combinations of more than 7 
variants376// (and this is by no means a useful sequence).377const maxVariants = 8378if len(varID) > maxVariants {379break380}381}382}383end = scan.end384}385if needSort {386sort.Sort(variantsSort{varID, variant})387k, l := 0, -1388for i, v := range varID {389w := int(v)390if l == w {391// Remove duplicates.392continue393}394varID[k] = varID[i]395variant[k] = variant[i]396k++397l = w398}399if str := bytes.Join(variant[:k], separator); len(str) == 0 {400end = start - 1401} else {402scan.resizeRange(start, end, len(str))403copy(scan.b[scan.start:], str)404end = scan.end405}406}407return end408}409410type variantsSort struct {411i []uint8412v [][]byte413}414415func (s variantsSort) Len() int {416return len(s.i)417}418419func (s variantsSort) Swap(i, j int) {420s.i[i], s.i[j] = s.i[j], s.i[i]421s.v[i], s.v[j] = s.v[j], s.v[i]422}423424func (s variantsSort) Less(i, j int) bool {425return s.i[i] < s.i[j]426}427428type bytesSort struct {429b [][]byte430n int // first n bytes to compare431}432433func (b bytesSort) Len() int {434return len(b.b)435}436437func (b bytesSort) Swap(i, j int) {438b.b[i], b.b[j] = b.b[j], b.b[i]439}440441func (b bytesSort) Less(i, j int) bool {442for k := 0; k < b.n; k++ {443if b.b[i][k] == b.b[j][k] {444continue445}446return b.b[i][k] < b.b[j][k]447}448return false449}450451// parseExtensions parses and normalizes the extensions in the buffer.452// It returns the last position of scan.b that is part of any extension.453// It also trims scan.b to remove excess parts accordingly.454func parseExtensions(scan *scanner) int {455start := scan.start456exts := [][]byte{}457private := []byte{}458end := scan.end459for len(scan.token) == 1 {460extStart := scan.start461ext := scan.token[0]462end = parseExtension(scan)463extension := scan.b[extStart:end]464if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {465scan.setError(ErrSyntax)466end = extStart467continue468} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {469scan.b = 
scan.b[:end]470return end471} else if ext == 'x' {472private = extension473break474}475exts = append(exts, extension)476}477sort.Sort(bytesSort{exts, 1})478if len(private) > 0 {479exts = append(exts, private)480}481scan.b = scan.b[:start]482if len(exts) > 0 {483scan.b = append(scan.b, bytes.Join(exts, separator)...)484} else if start > 0 {485// Strip trailing '-'.486scan.b = scan.b[:start-1]487}488return end489}490491// parseExtension parses a single extension and returns the position of492// the extension end.493func parseExtension(scan *scanner) int {494start, end := scan.start, scan.end495switch scan.token[0] {496case 'u': // https://www.ietf.org/rfc/rfc6067.txt497attrStart := end498scan.scan()499for last := []byte{}; len(scan.token) > 2; scan.scan() {500if bytes.Compare(scan.token, last) != -1 {501// Attributes are unsorted. Start over from scratch.502p := attrStart + 1503scan.next = p504attrs := [][]byte{}505for scan.scan(); len(scan.token) > 2; scan.scan() {506attrs = append(attrs, scan.token)507end = scan.end508}509sort.Sort(bytesSort{attrs, 3})510copy(scan.b[p:], bytes.Join(attrs, separator))511break512}513last = scan.token514end = scan.end515}516// Scan key-type sequences. 
A key is of length 2 and may be followed517// by 0 or more "type" subtags from 3 to the maximum of 8 letters.518var last, key []byte519for attrEnd := end; len(scan.token) == 2; last = key {520key = scan.token521end = scan.end522for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {523end = scan.end524}525// TODO: check key value validity526if bytes.Compare(key, last) != 1 || scan.err != nil {527// We have an invalid key or the keys are not sorted.528// Start scanning keys from scratch and reorder.529p := attrEnd + 1530scan.next = p531keys := [][]byte{}532for scan.scan(); len(scan.token) == 2; {533keyStart := scan.start534end = scan.end535for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {536end = scan.end537}538keys = append(keys, scan.b[keyStart:end])539}540sort.Stable(bytesSort{keys, 2})541if n := len(keys); n > 0 {542k := 0543for i := 1; i < n; i++ {544if !bytes.Equal(keys[k][:2], keys[i][:2]) {545k++546keys[k] = keys[i]547} else if !bytes.Equal(keys[k], keys[i]) {548scan.setError(ErrDuplicateKey)549}550}551keys = keys[:k+1]552}553reordered := bytes.Join(keys, separator)554if e := p + len(reordered); e < end {555scan.deleteRange(e, end)556end = e557}558copy(scan.b[p:], reordered)559break560}561}562case 't': // https://www.ietf.org/rfc/rfc6497.txt563scan.scan()564if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {565_, end = parseTag(scan, false)566scan.toLower(start, end)567}568for len(scan.token) == 2 && !isAlpha(scan.token[1]) {569end = scan.acceptMinSize(3)570}571case 'x':572end = scan.acceptMinSize(1)573default:574end = scan.acceptMinSize(2)575}576return end577}578579// getExtension returns the name, body and end position of the extension.580func getExtension(s string, p int) (end int, ext string) {581if s[p] == '-' {582p++583}584if s[p] == 'x' {585return len(s), s[p:]586}587end = nextExtension(s, p)588return end, s[p:end]589}590591// nextExtension finds the next extension within the string, searching592// 
for the -<char>- pattern from position p.593// In the fast majority of cases, language tags will have at most594// one extension and extensions tend to be small.595func nextExtension(s string, p int) int {596for n := len(s) - 3; p < n; {597if s[p] == '-' {598if s[p+2] == '-' {599return p600}601p += 3602} else {603p++604}605}606return len(s)607}608609610