// Path: blob/main/vendor/golang.org/x/net/html/escape.go
// 2880 views
// Copyright 2010 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34package html56import (7"bytes"8"strings"9"unicode/utf8"10)1112// These replacements permit compatibility with old numeric entities that13// assumed Windows-1252 encoding.14// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference15var replacementTable = [...]rune{16'\u20AC', // First entry is what 0x80 should be replaced with.17'\u0081',18'\u201A',19'\u0192',20'\u201E',21'\u2026',22'\u2020',23'\u2021',24'\u02C6',25'\u2030',26'\u0160',27'\u2039',28'\u0152',29'\u008D',30'\u017D',31'\u008F',32'\u0090',33'\u2018',34'\u2019',35'\u201C',36'\u201D',37'\u2022',38'\u2013',39'\u2014',40'\u02DC',41'\u2122',42'\u0161',43'\u203A',44'\u0153',45'\u009D',46'\u017E',47'\u0178', // Last entry is 0x9F.48// 0x00->'\uFFFD' is handled programmatically.49// 0x0D->'\u000D' is a no-op.50}5152// unescapeEntity reads an entity like "<" from b[src:] and writes the53// corresponding "<" to b[dst:], returning the incremented dst and src cursors.54// Precondition: b[src] == '&' && dst <= src.55// attribute should be true if parsing an attribute value.56func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {57// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference5859// i starts at 1 because we already know that s[0] == '&'.60i, s := 1, b[src:]6162if len(s) <= 1 {63b[dst] = b[src]64return dst + 1, src + 165}6667if s[i] == '#' {68if len(s) <= 3 { // We need to have at least "&#.".69b[dst] = b[src]70return dst + 1, src + 171}72i++73c := s[i]74hex := false75if c == 'x' || c == 'X' {76hex = true77i++78}7980x := '\x00'81for i < len(s) {82c = s[i]83i++84if hex {85if '0' <= c && c <= '9' {86x = 16*x + rune(c) - '0'87continue88} else if 'a' <= c && c <= 'f' {89x = 16*x + rune(c) - 'a' + 1090continue91} else if 'A' <= c && c <= 'F' {92x = 16*x + rune(c) - 'A' + 
1093continue94}95} else if '0' <= c && c <= '9' {96x = 10*x + rune(c) - '0'97continue98}99if c != ';' {100i--101}102break103}104105if i <= 3 { // No characters matched.106b[dst] = b[src]107return dst + 1, src + 1108}109110if 0x80 <= x && x <= 0x9F {111// Replace characters from Windows-1252 with UTF-8 equivalents.112x = replacementTable[x-0x80]113} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {114// Replace invalid characters with the replacement character.115x = '\uFFFD'116}117118return dst + utf8.EncodeRune(b[dst:], x), src + i119}120121// Consume the maximum number of characters possible, with the122// consumed characters matching one of the named references.123124for i < len(s) {125c := s[i]126i++127// Lower-cased characters are more common in entities, so we check for them first.128if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {129continue130}131if c != ';' {132i--133}134break135}136137entityName := string(s[1:i])138if entityName == "" {139// No-op.140} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {141// No-op.142} else if x := entity[entityName]; x != 0 {143return dst + utf8.EncodeRune(b[dst:], x), src + i144} else if x := entity2[entityName]; x[0] != 0 {145dst1 := dst + utf8.EncodeRune(b[dst:], x[0])146return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i147} else if !attribute {148maxLen := len(entityName) - 1149if maxLen > longestEntityWithoutSemicolon {150maxLen = longestEntityWithoutSemicolon151}152for j := maxLen; j > 1; j-- {153if x := entity[entityName[:j]]; x != 0 {154return dst + utf8.EncodeRune(b[dst:], x), src + j + 1155}156}157}158159dst1, src1 = dst+i, src+i160copy(b[dst:dst1], b[src:src1])161return dst1, src1162}163164// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b".165// attribute should be true if parsing an attribute value.166func unescape(b []byte, attribute bool) []byte {167for i, c := range b {168if c == '&' {169dst, src := 
unescapeEntity(b, i, i, attribute)170for src < len(b) {171c := b[src]172if c == '&' {173dst, src = unescapeEntity(b, dst, src, attribute)174} else {175b[dst] = c176dst, src = dst+1, src+1177}178}179return b[0:dst]180}181}182return b183}184185// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".186func lower(b []byte) []byte {187for i, c := range b {188if 'A' <= c && c <= 'Z' {189b[i] = c + 'a' - 'A'190}191}192return b193}194195// escapeComment is like func escape but escapes its input bytes less often.196// Per https://github.com/golang/go/issues/58246 some HTML comments are (1)197// meaningful and (2) contain angle brackets that we'd like to avoid escaping198// unless we have to.199//200// "We have to" includes the '&' byte, since that introduces other escapes.201//202// It also includes those bytes (not including EOF) that would otherwise end203// the comment. Per the summary table at the bottom of comment_test.go, this is204// the '>' byte that, per above, we'd like to avoid escaping unless we have to.205//206// Studying the summary table (and T actions in its '>' column) closely, we207// only need to escape in states 43, 44, 49, 51 and 52. State 43 is at the208// start of the comment data. State 52 is after a '!'. The other three states209// are after a '-'.210//211// Our algorithm is thus to escape every '&' and to escape '>' if and only if:212// - The '>' is after a '!' or '-' (in the unescaped data) or213// - The '>' is at the start of the comment data (after the opening "<!--").214func escapeComment(w writer, s string) error {215// When modifying this function, consider manually increasing the216// maxSuffixLen constant in func TestComments, from 6 to e.g. 
9 or more.217// That increase should only be temporary, not committed, as it218// exponentially affects the test running time.219220if len(s) == 0 {221return nil222}223224// Loop:225// - Grow j such that s[i:j] does not need escaping.226// - If s[j] does need escaping, output s[i:j] and an escaped s[j],227// resetting i and j to point past that s[j] byte.228i := 0229for j := 0; j < len(s); j++ {230escaped := ""231switch s[j] {232case '&':233escaped = "&"234235case '>':236if j > 0 {237if prev := s[j-1]; (prev != '!') && (prev != '-') {238continue239}240}241escaped = ">"242243default:244continue245}246247if i < j {248if _, err := w.WriteString(s[i:j]); err != nil {249return err250}251}252if _, err := w.WriteString(escaped); err != nil {253return err254}255i = j + 1256}257258if i < len(s) {259if _, err := w.WriteString(s[i:]); err != nil {260return err261}262}263return nil264}265266// escapeCommentString is to EscapeString as escapeComment is to escape.267func escapeCommentString(s string) string {268if strings.IndexAny(s, "&>") == -1 {269return s270}271var buf bytes.Buffer272escapeComment(&buf, s)273return buf.String()274}275276const escapedChars = "&'<>\"\r"277278func escape(w writer, s string) error {279i := strings.IndexAny(s, escapedChars)280for i != -1 {281if _, err := w.WriteString(s[:i]); err != nil {282return err283}284var esc string285switch s[i] {286case '&':287esc = "&"288case '\'':289// "'" is shorter than "'" and apos was not in HTML until HTML5.290esc = "'"291case '<':292esc = "<"293case '>':294esc = ">"295case '"':296// """ is shorter than """.297esc = """298case '\r':299esc = " "300default:301panic("html: unrecognized escape character")302}303s = s[i+1:]304if _, err := w.WriteString(esc); err != nil {305return err306}307i = strings.IndexAny(s, escapedChars)308}309_, err := w.WriteString(s)310return err311}312313// EscapeString escapes special characters like "<" to become "<". 
It314// escapes only five such characters: <, >, &, ' and ".315// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't316// always true.317func EscapeString(s string) string {318if strings.IndexAny(s, escapedChars) == -1 {319return s320}321var buf bytes.Buffer322escape(&buf, s)323return buf.String()324}325326// UnescapeString unescapes entities like "<" to become "<". It unescapes a327// larger range of entities than EscapeString escapes. For example, "á"328// unescapes to "á", as does "á" and "&xE1;".329// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't330// always true.331func UnescapeString(s string) string {332for _, c := range s {333if c == '&' {334return string(unescape([]byte(s), false))335}336}337return s338}339340341