Path: blob/main/vendor/golang.org/x/text/encoding/simplifiedchinese/gbk.go
2893 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34package simplifiedchinese56import (7"unicode/utf8"89"golang.org/x/text/encoding"10"golang.org/x/text/encoding/internal"11"golang.org/x/text/encoding/internal/identifier"12"golang.org/x/text/transform"13)1415var (16// GB18030 is the GB18030 encoding.17GB18030 encoding.Encoding = &gbk1803018// GBK is the GBK encoding. It encodes an extension of the GB2312 character set19// and is also known as Code Page 936.20GBK encoding.Encoding = &gbk21)2223var gbk = internal.Encoding{24&internal.SimpleEncoding{25gbkDecoder{gb18030: false},26gbkEncoder{gb18030: false},27},28"GBK",29identifier.GBK,30}3132var gbk18030 = internal.Encoding{33&internal.SimpleEncoding{34gbkDecoder{gb18030: true},35gbkEncoder{gb18030: true},36},37"GB18030",38identifier.GB18030,39}4041type gbkDecoder struct {42transform.NopResetter43gb18030 bool44}4546func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {47r, size := rune(0), 048loop:49for ; nSrc < len(src); nSrc += size {50switch c0 := src[nSrc]; {51case c0 < utf8.RuneSelf:52r, size = rune(c0), 15354// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC55// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk56// says to treat "gbk" as Code Page 936.57// GBK’s decoder is gb18030’s decoder. https://encoding.spec.whatwg.org/#gbk-decoder58// If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder59case c0 == 0x80:60r, size = '€', 16162case c0 < 0xff:63if nSrc+1 >= len(src) {64if !atEOF {65err = transform.ErrShortSrc66break loop67}68r, size = utf8.RuneError, 169goto write70}71c1 := src[nSrc+1]72switch {73case 0x40 <= c1 && c1 < 0x7f:74c1 -= 0x4075case 0x80 <= c1 && c1 < 0xff:76c1 -= 0x4177case d.gb18030 && 0x30 <= c1 && c1 < 0x40:78if nSrc+3 >= len(src) {79if !atEOF {80err = transform.ErrShortSrc81break loop82}83// The second byte here is always ASCII, so we can set size84// to 1 in all cases.85r, size = utf8.RuneError, 186goto write87}88c2 := src[nSrc+2]89if c2 < 0x81 || 0xff <= c2 {90r, size = utf8.RuneError, 191goto write92}93c3 := src[nSrc+3]94if c3 < 0x30 || 0x3a <= c3 {95r, size = utf8.RuneError, 196goto write97}98size = 499r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)100if r < 39420 {101i, j := 0, len(gb18030)102for i < j {103h := i + (j-i)/2104if r >= rune(gb18030[h][0]) {105i = h + 1106} else {107j = h108}109}110dec := &gb18030[i-1]111r += rune(dec[1]) - rune(dec[0])112goto write113}114r -= 189000115if 0 <= r && r < 0x100000 {116r += 0x10000117} else {118r, size = utf8.RuneError, 1119}120goto write121default:122r, size = utf8.RuneError, 1123goto write124}125r, size = '\ufffd', 2126if i := int(c0-0x81)*190 + int(c1); i < len(decode) {127r = rune(decode[i])128if r == 0 {129r = '\ufffd'130}131}132133default:134r, size = utf8.RuneError, 1135}136137write:138if nDst+utf8.RuneLen(r) > len(dst) {139err = transform.ErrShortDst140break loop141}142nDst += utf8.EncodeRune(dst[nDst:], r)143}144return nDst, nSrc, err145}146147type gbkEncoder struct {148transform.NopResetter149gb18030 bool150}151152func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {153r, r2, size := rune(0), rune(0), 0154for ; nSrc < len(src); nSrc += size {155r = rune(src[nSrc])156157// Decode a 1-byte rune.158if r < utf8.RuneSelf {159size = 1160161} else {162// Decode a multi-byte rune.163r, size = utf8.DecodeRune(src[nSrc:])164if size == 1 {165// All valid runes of size 1 (those below utf8.RuneSelf) were166// handled above. We have invalid UTF-8 or we haven't seen the167// full character yet.168if !atEOF && !utf8.FullRune(src[nSrc:]) {169err = transform.ErrShortSrc170break171}172}173174// func init checks that the switch covers all tables.175switch {176case encode0Low <= r && r < encode0High:177if r2 = rune(encode0[r-encode0Low]); r2 != 0 {178goto write2179}180case encode1Low <= r && r < encode1High:181// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC182// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk183// says to treat "gbk" as Code Page 936.184// GBK’s encoder is gb18030’s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder185// If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder186if !e.gb18030 && r == '€' {187r = 0x80188goto write1189}190if r2 = rune(encode1[r-encode1Low]); r2 != 0 {191goto write2192}193case encode2Low <= r && r < encode2High:194if r2 = rune(encode2[r-encode2Low]); r2 != 0 {195goto write2196}197case encode3Low <= r && r < encode3High:198if r2 = rune(encode3[r-encode3Low]); r2 != 0 {199goto write2200}201case encode4Low <= r && r < encode4High:202if r2 = rune(encode4[r-encode4Low]); r2 != 0 {203goto write2204}205}206207if e.gb18030 {208if r < 0x10000 {209i, j := 0, len(gb18030)210for i < j {211h := i + (j-i)/2212if r >= rune(gb18030[h][1]) {213i = h + 1214} else {215j = h216}217}218dec := &gb18030[i-1]219r += rune(dec[0]) - rune(dec[1])220goto write4221} else if r < 0x110000 {222r += 189000 - 0x10000223goto write4224}225}226err = internal.ErrASCIIReplacement227break228}229230write1:231if nDst >= len(dst) {232err = transform.ErrShortDst233break234}235dst[nDst] = uint8(r)236nDst++237continue238239write2:240if nDst+2 > len(dst) {241err = transform.ErrShortDst242break243}244dst[nDst+0] = uint8(r2 >> 8)245dst[nDst+1] = uint8(r2)246nDst += 2247continue248249write4:250if nDst+4 > len(dst) {251err = transform.ErrShortDst252break253}254dst[nDst+3] = uint8(r%10 + 0x30)255r /= 10256dst[nDst+2] = uint8(r%126 + 0x81)257r /= 126258dst[nDst+1] = uint8(r%10 + 0x30)259r /= 10260dst[nDst+0] = uint8(r + 0x81)261nDst += 4262continue263}264return nDst, nSrc, err265}266267func init() {268// Check that the hard-coded encode switch covers all tables.269if numEncodeTables != 5 {270panic("bad numEncodeTables")271}272}273274275