Path: blob/main/vendor/golang.org/x/text/encoding/encoding.go
2880 views
// Copyright 2013 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34// Package encoding defines an interface for character encodings, such as Shift5// JIS and Windows 1252, that can convert to and from UTF-8.6//7// Encoding implementations are provided in other packages, such as8// golang.org/x/text/encoding/charmap and9// golang.org/x/text/encoding/japanese.10package encoding // import "golang.org/x/text/encoding"1112import (13"errors"14"io"15"strconv"16"unicode/utf8"1718"golang.org/x/text/encoding/internal/identifier"19"golang.org/x/text/transform"20)2122// TODO:23// - There seems to be some inconsistency in when decoders return errors24// and when not. Also documentation seems to suggest they shouldn't return25// errors at all (except for UTF-16).26// - Encoders seem to rely on or at least benefit from the input being in NFC27// normal form. Perhaps add an example how users could prepare their output.2829// Encoding is a character set encoding that can be transformed to and from30// UTF-8.31type Encoding interface {32// NewDecoder returns a Decoder.33NewDecoder() *Decoder3435// NewEncoder returns an Encoder.36NewEncoder() *Encoder37}3839// A Decoder converts bytes to UTF-8. It implements transform.Transformer.40//41// Transforming source bytes that are not of that encoding will not result in an42// error per se. Each byte that cannot be transcoded will be represented in the43// output by the UTF-8 encoding of '\uFFFD', the replacement rune.44type Decoder struct {45transform.Transformer4647// This forces external creators of Decoders to use names in struct48// initializers, allowing for future extendibility without having to break49// code.50_ struct{}51}5253// Bytes converts the given encoded bytes to UTF-8. It returns the converted54// bytes or nil, err if any error occurred.55func (d *Decoder) Bytes(b []byte) ([]byte, error) {56b, _, err := transform.Bytes(d, b)57if err != nil {58return nil, err59}60return b, nil61}6263// String converts the given encoded string to UTF-8. It returns the converted64// string or "", err if any error occurred.65func (d *Decoder) String(s string) (string, error) {66s, _, err := transform.String(d, s)67if err != nil {68return "", err69}70return s, nil71}7273// Reader wraps another Reader to decode its bytes.74//75// The Decoder may not be used for any other operation as long as the returned76// Reader is in use.77func (d *Decoder) Reader(r io.Reader) io.Reader {78return transform.NewReader(r, d)79}8081// An Encoder converts bytes from UTF-8. It implements transform.Transformer.82//83// Each rune that cannot be transcoded will result in an error. In this case,84// the transform will consume all source byte up to, not including the offending85// rune. Transforming source bytes that are not valid UTF-8 will be replaced by86// `\uFFFD`. To return early with an error instead, use transform.Chain to87// preprocess the data with a UTF8Validator.88type Encoder struct {89transform.Transformer9091// This forces external creators of Encoders to use names in struct92// initializers, allowing for future extendibility without having to break93// code.94_ struct{}95}9697// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if98// any error occurred.99func (e *Encoder) Bytes(b []byte) ([]byte, error) {100b, _, err := transform.Bytes(e, b)101if err != nil {102return nil, err103}104return b, nil105}106107// String converts a string from UTF-8. It returns the converted string or108// "", err if any error occurred.109func (e *Encoder) String(s string) (string, error) {110s, _, err := transform.String(e, s)111if err != nil {112return "", err113}114return s, nil115}116117// Writer wraps another Writer to encode its UTF-8 output.118//119// The Encoder may not be used for any other operation as long as the returned120// Writer is in use.121func (e *Encoder) Writer(w io.Writer) io.Writer {122return transform.NewWriter(w, e)123}124125// ASCIISub is the ASCII substitute character, as recommended by126// https://unicode.org/reports/tr36/#Text_Comparison127const ASCIISub = '\x1a'128129// Nop is the nop encoding. Its transformed bytes are the same as the source130// bytes; it does not replace invalid UTF-8 sequences.131var Nop Encoding = nop{}132133type nop struct{}134135func (nop) NewDecoder() *Decoder {136return &Decoder{Transformer: transform.Nop}137}138func (nop) NewEncoder() *Encoder {139return &Encoder{Transformer: transform.Nop}140}141142// Replacement is the replacement encoding. Decoding from the replacement143// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to144// the replacement encoding yields the same as the source bytes except that145// invalid UTF-8 is converted to '\uFFFD'.146//147// It is defined at http://encoding.spec.whatwg.org/#replacement148var Replacement Encoding = replacement{}149150type replacement struct{}151152func (replacement) NewDecoder() *Decoder {153return &Decoder{Transformer: replacementDecoder{}}154}155156func (replacement) NewEncoder() *Encoder {157return &Encoder{Transformer: replacementEncoder{}}158}159160func (replacement) ID() (mib identifier.MIB, other string) {161return identifier.Replacement, ""162}163164type replacementDecoder struct{ transform.NopResetter }165166func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {167if len(dst) < 3 {168return 0, 0, transform.ErrShortDst169}170if atEOF {171const fffd = "\ufffd"172dst[0] = fffd[0]173dst[1] = fffd[1]174dst[2] = fffd[2]175nDst = 3176}177return nDst, len(src), nil178}179180type replacementEncoder struct{ transform.NopResetter }181182func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {183r, size := rune(0), 0184185for ; nSrc < len(src); nSrc += size {186r = rune(src[nSrc])187188// Decode a 1-byte rune.189if r < utf8.RuneSelf {190size = 1191192} else {193// Decode a multi-byte rune.194r, size = utf8.DecodeRune(src[nSrc:])195if size == 1 {196// All valid runes of size 1 (those below utf8.RuneSelf) were197// handled above. We have invalid UTF-8 or we haven't seen the198// full character yet.199if !atEOF && !utf8.FullRune(src[nSrc:]) {200err = transform.ErrShortSrc201break202}203r = '\ufffd'204}205}206207if nDst+utf8.RuneLen(r) > len(dst) {208err = transform.ErrShortDst209break210}211nDst += utf8.EncodeRune(dst[nDst:], r)212}213return nDst, nSrc, err214}215216// HTMLEscapeUnsupported wraps encoders to replace source runes outside the217// repertoire of the destination encoding with HTML escape sequences.218//219// This wrapper exists to comply to URL and HTML forms requiring a220// non-terminating legacy encoder. The produced sequences may lead to data221// loss as they are indistinguishable from legitimate input. To avoid this222// issue, use UTF-8 encodings whenever possible.223func HTMLEscapeUnsupported(e *Encoder) *Encoder {224return &Encoder{Transformer: &errorHandler{e, errorToHTML}}225}226227// ReplaceUnsupported wraps encoders to replace source runes outside the228// repertoire of the destination encoding with an encoding-specific229// replacement.230//231// This wrapper is only provided for backwards compatibility and legacy232// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.233func ReplaceUnsupported(e *Encoder) *Encoder {234return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}235}236237type errorHandler struct {238*Encoder239handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)240}241242// TODO: consider making this error public in some form.243type repertoireError interface {244Replacement() byte245}246247func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {248nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)249for err != nil {250rerr, ok := err.(repertoireError)251if !ok {252return nDst, nSrc, err253}254r, sz := utf8.DecodeRune(src[nSrc:])255n, ok := h.handler(dst[nDst:], r, rerr)256if !ok {257return nDst, nSrc, transform.ErrShortDst258}259err = nil260nDst += n261if nSrc += sz; nSrc < len(src) {262var dn, sn int263dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)264nDst += dn265nSrc += sn266}267}268return nDst, nSrc, err269}270271func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {272buf := [8]byte{}273b := strconv.AppendUint(buf[:0], uint64(r), 10)274if n = len(b) + len("&#;"); n >= len(dst) {275return 0, false276}277dst[0] = '&'278dst[1] = '#'279dst[copy(dst[2:], b)+2] = ';'280return n, true281}282283func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {284if len(dst) == 0 {285return 0, false286}287dst[0] = err.Replacement()288return 1, true289}290291// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.292var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")293294// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first295// input byte that is not valid UTF-8.296var UTF8Validator transform.Transformer = utf8Validator{}297298type utf8Validator struct{ transform.NopResetter }299300func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {301n := len(src)302if n > len(dst) {303n = len(dst)304}305for i := 0; i < n; {306if c := src[i]; c < utf8.RuneSelf {307dst[i] = c308i++309continue310}311_, size := utf8.DecodeRune(src[i:])312if size == 1 {313// All valid runes of size 1 (those below utf8.RuneSelf) were314// handled above. We have invalid UTF-8 or we haven't seen the315// full character yet.316err = ErrInvalidUTF8317if !atEOF && !utf8.FullRune(src[i:]) {318err = transform.ErrShortSrc319}320return i, i, err321}322if i+size > len(dst) {323return i, i, transform.ErrShortDst324}325for ; size > 0; size-- {326dst[i] = src[i]327i++328}329}330if len(src) > len(dst) {331err = transform.ErrShortDst332}333return n, n, err334}335336337