Path: blob/main/vendor/golang.org/x/text/encoding/charmap/charmap.go
2893 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34//go:generate go run maketables.go56// Package charmap provides simple character encodings such as IBM Code Page 4377// and Windows 1252.8package charmap // import "golang.org/x/text/encoding/charmap"910import (11"unicode/utf8"1213"golang.org/x/text/encoding"14"golang.org/x/text/encoding/internal"15"golang.org/x/text/encoding/internal/identifier"16"golang.org/x/text/transform"17)1819// These encodings vary only in the way clients should interpret them. Their20// coded character set is identical and a single implementation can be shared.21var (22// ISO8859_6E is the ISO 8859-6E encoding.23ISO8859_6E encoding.Encoding = &iso8859_6E2425// ISO8859_6I is the ISO 8859-6I encoding.26ISO8859_6I encoding.Encoding = &iso8859_6I2728// ISO8859_8E is the ISO 8859-8E encoding.29ISO8859_8E encoding.Encoding = &iso8859_8E3031// ISO8859_8I is the ISO 8859-8I encoding.32ISO8859_8I encoding.Encoding = &iso8859_8I3334iso8859_6E = internal.Encoding{35Encoding: ISO8859_6,36Name: "ISO-8859-6E",37MIB: identifier.ISO88596E,38}3940iso8859_6I = internal.Encoding{41Encoding: ISO8859_6,42Name: "ISO-8859-6I",43MIB: identifier.ISO88596I,44}4546iso8859_8E = internal.Encoding{47Encoding: ISO8859_8,48Name: "ISO-8859-8E",49MIB: identifier.ISO88598E,50}5152iso8859_8I = internal.Encoding{53Encoding: ISO8859_8,54Name: "ISO-8859-8I",55MIB: identifier.ISO88598I,56}57)5859// All is a list of all defined encodings in this package.60var All []encoding.Encoding = listAll6162// TODO: implement these encodings, in order of importance.63// ASCII, ISO8859_1: Rather common. Close to Windows 1252.64// ISO8859_9: Close to Windows 1254.6566// utf8Enc holds a rune's UTF-8 encoding in data[:len].67type utf8Enc struct {68len uint869data [3]byte70}7172// Charmap is an 8-bit character set encoding.73type Charmap struct {74// name is the encoding's name.75name string76// mib is the encoding type of this encoder.77mib identifier.MIB78// asciiSuperset states whether the encoding is a superset of ASCII.79asciiSuperset bool80// low is the lower bound of the encoded byte for a non-ASCII rune. If81// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.82low uint883// replacement is the encoded replacement character.84replacement byte85// decode is the map from encoded byte to UTF-8.86decode [256]utf8Enc87// encoding is the map from runes to encoded bytes. Each entry is a88// uint32: the high 8 bits are the encoded byte and the low 24 bits are89// the rune. The table entries are sorted by ascending rune.90encode [256]uint3291}9293// NewDecoder implements the encoding.Encoding interface.94func (m *Charmap) NewDecoder() *encoding.Decoder {95return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}96}9798// NewEncoder implements the encoding.Encoding interface.99func (m *Charmap) NewEncoder() *encoding.Encoder {100return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}101}102103// String returns the Charmap's name.104func (m *Charmap) String() string {105return m.name106}107108// ID implements an internal interface.109func (m *Charmap) ID() (mib identifier.MIB, other string) {110return m.mib, ""111}112113// charmapDecoder implements transform.Transformer by decoding to UTF-8.114type charmapDecoder struct {115transform.NopResetter116charmap *Charmap117}118119func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {120for i, c := range src {121if m.charmap.asciiSuperset && c < utf8.RuneSelf {122if nDst >= len(dst) {123err = transform.ErrShortDst124break125}126dst[nDst] = c127nDst++128nSrc = i + 1129continue130}131132decode := &m.charmap.decode[c]133n := int(decode.len)134if nDst+n > len(dst) {135err = transform.ErrShortDst136break137}138// It's 15% faster to avoid calling copy for these tiny slices.139for j := 0; j < n; j++ {140dst[nDst] = decode.data[j]141nDst++142}143nSrc = i + 1144}145return nDst, nSrc, err146}147148// DecodeByte returns the Charmap's rune decoding of the byte b.149func (m *Charmap) DecodeByte(b byte) rune {150switch x := &m.decode[b]; x.len {151case 1:152return rune(x.data[0])153case 2:154return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)155default:156return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)157}158}159160// charmapEncoder implements transform.Transformer by encoding from UTF-8.161type charmapEncoder struct {162transform.NopResetter163charmap *Charmap164}165166func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {167r, size := rune(0), 0168loop:169for nSrc < len(src) {170if nDst >= len(dst) {171err = transform.ErrShortDst172break173}174r = rune(src[nSrc])175176// Decode a 1-byte rune.177if r < utf8.RuneSelf {178if m.charmap.asciiSuperset {179nSrc++180dst[nDst] = uint8(r)181nDst++182continue183}184size = 1185186} else {187// Decode a multi-byte rune.188r, size = utf8.DecodeRune(src[nSrc:])189if size == 1 {190// All valid runes of size 1 (those below utf8.RuneSelf) were191// handled above. We have invalid UTF-8 or we haven't seen the192// full character yet.193if !atEOF && !utf8.FullRune(src[nSrc:]) {194err = transform.ErrShortSrc195} else {196err = internal.RepertoireError(m.charmap.replacement)197}198break199}200}201202// Binary search in [low, high) for that rune in the m.charmap.encode table.203for low, high := int(m.charmap.low), 0x100; ; {204if low >= high {205err = internal.RepertoireError(m.charmap.replacement)206break loop207}208mid := (low + high) / 2209got := m.charmap.encode[mid]210gotRune := rune(got & (1<<24 - 1))211if gotRune < r {212low = mid + 1213} else if gotRune > r {214high = mid215} else {216dst[nDst] = byte(got >> 24)217nDst++218break219}220}221nSrc += size222}223return nDst, nSrc, err224}225226// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether227// r is in the Charmap's repertoire. If not, b is set to the Charmap's228// replacement byte. This is often the ASCII substitute character '\x1a'.229func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {230if r < utf8.RuneSelf && m.asciiSuperset {231return byte(r), true232}233for low, high := int(m.low), 0x100; ; {234if low >= high {235return m.replacement, false236}237mid := (low + high) / 2238got := m.encode[mid]239gotRune := rune(got & (1<<24 - 1))240if gotRune < r {241low = mid + 1242} else if gotRune > r {243high = mid244} else {245return byte(got >> 24), true246}247}248}249250251