Path: blob/main/vendor/golang.org/x/net/html/token.go
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"errors"
	"io"
	"strconv"
	"strings"

	"golang.org/x/net/html/atom"
)

// A TokenType is the type of a Token.
type TokenType uint32

const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like <a>.
	StartTagToken
	// An EndTagToken looks like </a>.
	EndTagToken
	// A SelfClosingTagToken looks like <br/>.
	SelfClosingTagToken
	// A CommentToken looks like <!--x-->.
	CommentToken
	// A DoctypeToken looks like <!DOCTYPE x>.
	DoctypeToken
)

// ErrBufferExceeded means that the buffering limit was exceeded.
var ErrBufferExceeded = errors.New("max buffer exceeded")

// String returns a string representation of the TokenType.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}

// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a<b" rather than "a&lt;b").
//
// Namespace is only used by the parser, not the tokenizer.
type Attribute struct {
	Namespace, Key, Val string
}

// A Token consists of a TokenType and some Data (tag name for start and end
// tags, content for text, comments and doctypes). A tag Token may also contain
// a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
// zero if Data is not a known tag name.
type Token struct {
	Type     TokenType
	DataAtom atom.Atom
	Data     string
	Attr     []Attribute
}

// tagString returns a string representation of a tag Token's Data and Attr.
func (t Token) tagString() string {
	if len(t.Attr) == 0 {
		return t.Data
	}
	buf := bytes.NewBufferString(t.Data)
	for _, a := range t.Attr {
		buf.WriteByte(' ')
		buf.WriteString(a.Key)
		buf.WriteString(`="`)
		escape(buf, a.Val)
		buf.WriteByte('"')
	}
	return buf.String()
}

// String returns a string representation of the Token.
func (t Token) String() string {
	switch t.Type {
	case ErrorToken:
		return ""
	case TextToken:
		return EscapeString(t.Data)
	case StartTagToken:
		return "<" + t.tagString() + ">"
	case EndTagToken:
		return "</" + t.tagString() + ">"
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		return "<!--" + escapeCommentString(t.Data) + "-->"
	case DoctypeToken:
		return "<!DOCTYPE " + EscapeString(t.Data) + ">"
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
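
// A minimal usage sketch (illustrative client code, not part of this file):
// draining a Tokenizer with Next, Err and Token. Err reporting io.EOF means
// the input was tokenized completely; any other error is a real failure. The
// input string here is an arbitrary example.
//
//	z := html.NewTokenizer(strings.NewReader(`<p class="x">a &amp; b</p>`))
//	for {
//		tt := z.Next()
//		if tt == html.ErrorToken {
//			break // z.Err() is io.EOF once the input is exhausted.
//		}
//		t := z.Token()
//		fmt.Printf("%v %q\n", tt, t.Data) // StartTag "p", Text "a & b", EndTag "p"
//	}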

// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}

// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != ErrorToken && err != nil to hold: this means that Next returned
	// a valid token but the subsequent Next call will return an error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "</script>" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "<p>" as text instead of an element.
	// rawTag's contents are lower-cased.
	rawTag string
	// textIsRaw is whether the current text token's data is not escaped.
	textIsRaw bool
	// convertNUL is whether NUL bytes in the current token's data should
	// be converted into \ufffd replacement characters.
	convertNUL bool
	// allowCDATA is whether CDATA sections are allowed in the current context.
	allowCDATA bool
}

// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
// the text "foo". The default value is false, which means to recognize it as
// a bogus comment "<!-- [CDATA[foo]] -->" instead.
//
// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
// only if tokenizing foreign content, such as MathML and SVG. However,
// tracking foreign-contentness is difficult to do purely in the tokenizer,
// as opposed to the parser, due to HTML integration points: an <svg> element
// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
// HTML. For strict compliance with the HTML5 tokenization algorithm, it is
// the responsibility of the user of a tokenizer to call AllowCDATA as
// appropriate. In practice, if using the tokenizer without caring whether
// MathML or SVG CDATA is text or comments, such as tokenizing HTML to find
// all the anchor text, it is acceptable to ignore this responsibility.
func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
	z.allowCDATA = allowCDATA
}
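
// An illustrative sketch of the switch above (client code, not part of this
// file): the same bytes are a text token with AllowCDATA(true) and a bogus
// comment token without it.
//
//	z := html.NewTokenizer(strings.NewReader("<![CDATA[x < y]]>"))
//	z.AllowCDATA(true)
//	fmt.Println(z.Next())        // Text
//	fmt.Printf("%q\n", z.Text()) // "x < y"
//
//	z = html.NewTokenizer(strings.NewReader("<![CDATA[x < y]]>"))
//	fmt.Println(z.Next()) // Comment (the default is false)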

// NextIsNotRawText instructs the tokenizer that the next token should not be
// considered as 'raw text'. Some elements, such as script and title elements,
// normally require the next token after the opening tag to be 'raw text' that
// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
// an end tag token for "</title>". There are no distinct start tag or end tag
// tokens for the "<b>" and "</b>".
//
// This tokenizer implementation will generally look for raw text at the right
// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
// raw text if in foreign content: <title> generally needs raw text, but a
// <title> inside an <svg> does not. Another example is that a <textarea>
// generally needs raw text, but a <textarea> is not allowed as an immediate
// child of a <select>; in normal parsing, a <textarea> implies </select>, but
// one cannot close the implicit element when parsing a <select>'s InnerHTML.
// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
// ness is difficult to do purely in the tokenizer, as opposed to the parser.
// For strict compliance with the HTML5 tokenization algorithm, it is the
// responsibility of the user of a tokenizer to call NextIsNotRawText as
// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
// responsibility for basic usage.
//
// Note that this 'raw text' concept is different from the one offered by the
// Tokenizer.Raw method.
func (z *Tokenizer) NextIsNotRawText() {
	z.rawTag = ""
}

// Err returns the error associated with the most recent ErrorToken.
// This is typically io.EOF, meaning the end of tokenization.
func (z *Tokenizer) Err() error {
	if z.tt != ErrorToken {
		return nil
	}
	return z.err
}

// readByte returns the next byte from the input stream, doing a buffered read
// from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a
// contiguous byte slice that holds all the bytes read so far for the current
// token. It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r. Check if the
		// previous read resulted in an error.
		if z.readErr != nil {
			z.err = z.readErr
			return 0
		}
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the
		// length z.raw.end - z.raw.start is more than half the capacity of z.buf,
		// then we allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the remainder.
		var n int
		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
		if n == 0 {
			z.err = z.readErr
			return 0
		}
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
		z.err = ErrBufferExceeded
		return 0
	}
	return x
}
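
// An illustrative sketch (client code, not part of this file): the maxBuf
// check at the bottom of readByte is what enforces SetMaxBuf, defined near
// the end of this file. A single token whose raw bytes reach the limit
// surfaces as an ErrorToken whose Err is ErrBufferExceeded.
//
//	z := html.NewTokenizer(strings.NewReader("<div " + strings.Repeat("a", 1<<20) + ">"))
//	z.SetMaxBuf(4096)
//	for z.Next() != html.ErrorToken {
//	}
//	fmt.Println(z.Err()) // max buffer exceeded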

// Buffered returns a slice containing data buffered but not yet tokenized.
func (z *Tokenizer) Buffered() []byte {
	return z.buf[z.raw.end:]
}

// readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil).
// It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil)
// too many times in succession.
func readAtLeastOneByte(r io.Reader, b []byte) (int, error) {
	for i := 0; i < 100; i++ {
		if n, err := r.Read(b); n != 0 || err != nil {
			return n, err
		}
	}
	return 0, io.ErrNoProgress
}

// skipWhiteSpace skips past any white space.
func (z *Tokenizer) skipWhiteSpace() {
	if z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil {
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			// No-op.
		default:
			z.raw.end--
			return
		}
	}
}

// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
// is typically something like "script" or "textarea".
func (z *Tokenizer) readRawOrRCDATA() {
	if z.rawTag == "script" {
		z.readScript()
		z.textIsRaw = true
		z.rawTag = ""
		return
	}
loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '/' {
			z.raw.end--
			continue loop
		}
		if z.readRawEndTag() || z.err != nil {
			break loop
		}
	}
	z.data.end = z.raw.end
	// A textarea's or title's RCDATA can contain escaped entities.
	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
	z.rawTag = ""
}
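
// An illustrative sketch of the textIsRaw distinction above (client code, not
// part of this file): <title> content is RCDATA, so Text unescapes its
// entities, while <style> content is raw and comes back verbatim.
//
//	z := html.NewTokenizer(strings.NewReader(
//		"<title>a&amp;b</title><style>a&amp;b</style>"))
//	for {
//		tt := z.Next()
//		if tt == html.ErrorToken {
//			break
//		}
//		if tt == html.TextToken {
//			fmt.Printf("%q\n", z.Text()) // "a&b", then "a&amp;b"
//		}
//	}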
The opening "</" has already been364// consumed.365func (z *Tokenizer) readRawEndTag() bool {366for i := 0; i < len(z.rawTag); i++ {367c := z.readByte()368if z.err != nil {369return false370}371if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {372z.raw.end--373return false374}375}376c := z.readByte()377if z.err != nil {378return false379}380switch c {381case ' ', '\n', '\r', '\t', '\f', '/', '>':382// The 3 is 2 for the leading "</" plus 1 for the trailing character c.383z.raw.end -= 3 + len(z.rawTag)384return true385}386z.raw.end--387return false388}389390// readScript reads until the next </script> tag, following the byzantine391// rules for escaping/hiding the closing tag.392func (z *Tokenizer) readScript() {393defer func() {394z.data.end = z.raw.end395}()396var c byte397398scriptData:399c = z.readByte()400if z.err != nil {401return402}403if c == '<' {404goto scriptDataLessThanSign405}406goto scriptData407408scriptDataLessThanSign:409c = z.readByte()410if z.err != nil {411return412}413switch c {414case '/':415goto scriptDataEndTagOpen416case '!':417goto scriptDataEscapeStart418}419z.raw.end--420goto scriptData421422scriptDataEndTagOpen:423if z.readRawEndTag() || z.err != nil {424return425}426goto scriptData427428scriptDataEscapeStart:429c = z.readByte()430if z.err != nil {431return432}433if c == '-' {434goto scriptDataEscapeStartDash435}436z.raw.end--437goto scriptData438439scriptDataEscapeStartDash:440c = z.readByte()441if z.err != nil {442return443}444if c == '-' {445goto scriptDataEscapedDashDash446}447z.raw.end--448goto scriptData449450scriptDataEscaped:451c = z.readByte()452if z.err != nil {453return454}455switch c {456case '-':457goto scriptDataEscapedDash458case '<':459goto scriptDataEscapedLessThanSign460}461goto scriptDataEscaped462463scriptDataEscapedDash:464c = z.readByte()465if z.err != nil {466return467}468switch c {469case '-':470goto scriptDataEscapedDashDash471case '<':472goto scriptDataEscapedLessThanSign473}474goto scriptDataEscaped475476scriptDataEscapedDashDash:477c = z.readByte()478if z.err != nil {479return480}481switch c {482case '-':483goto scriptDataEscapedDashDash484case '<':485goto scriptDataEscapedLessThanSign486case '>':487goto scriptData488}489goto scriptDataEscaped490491scriptDataEscapedLessThanSign:492c = z.readByte()493if z.err != nil {494return495}496if c == '/' {497goto scriptDataEscapedEndTagOpen498}499if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {500goto scriptDataDoubleEscapeStart501}502z.raw.end--503goto scriptData504505scriptDataEscapedEndTagOpen:506if z.readRawEndTag() || z.err != nil {507return508}509goto scriptDataEscaped510511scriptDataDoubleEscapeStart:512z.raw.end--513for i := 0; i < len("script"); i++ {514c = z.readByte()515if z.err != nil {516return517}518if c != "script"[i] && c != "SCRIPT"[i] {519z.raw.end--520goto scriptDataEscaped521}522}523c = z.readByte()524if z.err != nil {525return526}527switch c {528case ' ', '\n', '\r', '\t', '\f', '/', '>':529goto scriptDataDoubleEscaped530}531z.raw.end--532goto scriptDataEscaped533534scriptDataDoubleEscaped:535c = z.readByte()536if z.err != nil {537return538}539switch c {540case '-':541goto scriptDataDoubleEscapedDash542case '<':543goto scriptDataDoubleEscapedLessThanSign544}545goto scriptDataDoubleEscaped546547scriptDataDoubleEscapedDash:548c = z.readByte()549if z.err != nil {550return551}552switch c {553case '-':554goto scriptDataDoubleEscapedDashDash555case '<':556goto scriptDataDoubleEscapedLessThanSign557}558goto scriptDataDoubleEscaped559560scriptDataDoubleEscapedDashDash:561c = 

// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
	// When modifying this function, consider manually increasing the
	// maxSuffixLen constant in func TestComments, from 6 to e.g. 9 or more.
	// That increase should only be temporary, not committed, as it
	// exponentially affects the test running time.

	z.data.start = z.raw.end
	defer func() {
		if z.data.end < z.data.start {
			// It's a comment with no data, like <!-->.
			z.data.end = z.data.start
		}
	}()

	var dashCount int
	beginning := true
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.calculateAbruptCommentDataEnd()
			return
		}
		switch c {
		case '-':
			dashCount++
			continue
		case '>':
			if dashCount >= 2 || beginning {
				z.data.end = z.raw.end - len("-->")
				return
			}
		case '!':
			if dashCount >= 2 {
				c = z.readByte()
				if z.err != nil {
					z.data.end = z.calculateAbruptCommentDataEnd()
					return
				} else if c == '>' {
					z.data.end = z.raw.end - len("--!>")
					return
				} else if c == '-' {
					dashCount = 1
					beginning = false
					continue
				}
			}
		}
		dashCount = 0
		beginning = false
	}
}

func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
	raw := z.Raw()
	const prefixLen = len("<!--")
	if len(raw) >= prefixLen {
		raw = raw[prefixLen:]
		if hasSuffix(raw, "--!") {
			return z.raw.end - 3
		} else if hasSuffix(raw, "--") {
			return z.raw.end - 2
		} else if hasSuffix(raw, "-") {
			return z.raw.end - 1
		}
	}
	return z.raw.end
}

func hasSuffix(b []byte, suffix string) bool {
	if len(b) < len(suffix) {
		return false
	}
	b = b[len(b)-len(suffix):]
	for i := range b {
		if b[i] != suffix[i] {
			return false
		}
	}
	return true
}

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
	z.data.start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		if c == '>' {
			z.data.end = z.raw.end - len(">")
			return
		}
	}
}

// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
// "<!a bogus comment". The opening "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
	z.data.start = z.raw.end
	var c [2]byte
	for i := 0; i < 2; i++ {
		c[i] = z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
	}
	if c[0] == '-' && c[1] == '-' {
		z.readComment()
		return CommentToken
	}
	z.raw.end -= 2
	if z.readDoctype() {
		return DoctypeToken
	}
	if z.allowCDATA && z.readCDATA() {
		z.convertNUL = true
		return TextToken
	}
	// It's a bogus comment.
	z.readUntilCloseAngle()
	return CommentToken
}
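
// An illustrative sketch (client code, not part of this file): per the
// comment above, CommentToken covers real comments, bogus comments and
// processing instructions; only a well-formed doctype becomes DoctypeToken.
//
//	z := html.NewTokenizer(strings.NewReader("<!--a--><!DOCTYPE html><?xml?><!b>"))
//	for {
//		tt := z.Next()
//		if tt == html.ErrorToken {
//			break
//		}
//		fmt.Printf("%v %q\n", tt, z.Text())
//	}
//	// Prints: Comment "a", Doctype "html", Comment "?xml?", Comment "b"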

// readDoctype attempts to read a doctype declaration and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readDoctype() bool {
	const s = "DOCTYPE"
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] && c != s[i]+('a'-'A') {
			// Back up to read the fragment of "DOCTYPE" again.
			z.raw.end = z.data.start
			return false
		}
	}
	if z.skipWhiteSpace(); z.err != nil {
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		return true
	}
	z.readUntilCloseAngle()
	return true
}

// readCDATA attempts to read a CDATA section and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readCDATA() bool {
	const s = "[CDATA["
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] {
			// Back up to read the fragment of "[CDATA[" again.
			z.raw.end = z.data.start
			return false
		}
	}
	z.data.start = z.raw.end
	brackets := 0
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return true
		}
		switch c {
		case ']':
			brackets++
		case '>':
			if brackets >= 2 {
				z.data.end = z.raw.end - len("]]>")
				return true
			}
			brackets = 0
		default:
			brackets = 0
		}
	}
}

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
// case-insensitively matches any element of ss.
func (z *Tokenizer) startTagIn(ss ...string) bool {
loop:
	for _, s := range ss {
		if z.data.end-z.data.start != len(s) {
			continue loop
		}
		for i := 0; i < len(s); i++ {
			c := z.buf[z.data.start+i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c != s[i] {
				continue loop
			}
		}
		return true
	}
	return false
}

// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
	z.readTag(true)
	if z.err != nil {
		return ErrorToken
	}
	// Several tags flag the tokenizer's next token as raw.
	c, raw := z.buf[z.data.start], false
	if 'A' <= c && c <= 'Z' {
		c += 'a' - 'A'
	}
	switch c {
	case 'i':
		raw = z.startTagIn("iframe")
	case 'n':
		raw = z.startTagIn("noembed", "noframes", "noscript")
	case 'p':
		raw = z.startTagIn("plaintext")
	case 's':
		raw = z.startTagIn("script", "style")
	case 't':
		raw = z.startTagIn("textarea", "title")
	case 'x':
		raw = z.startTagIn("xmp")
	}
	if raw {
		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
	}
	// Look for a self-closing token (e.g. <br/>).
	//
	// Originally, we did this by just checking that the last character of the
	// tag (ignoring the closing bracket) was a solidus (/) character, but this
	// is not always accurate.
	//
	// We need to be careful that we don't misinterpret a non-self-closing tag
	// as self-closing, as can happen if the tag contains unquoted attribute
	// values (e.g. <p a=/>).
	//
	// To avoid this, we check that the last non-bracket character of the tag
	// (z.raw.end-2) isn't the same character as the last non-quote character of
	// the last attribute of the tag (z.pendingAttr[1].end-1), if the tag has
	// attributes.
	nAttrs := len(z.attr)
	if z.err == nil && z.buf[z.raw.end-2] == '/' && (nAttrs == 0 || z.raw.end-2 != z.attr[nAttrs-1][1].end-1) {
		return SelfClosingTagToken
	}
	return StartTagToken
}
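
// An illustrative sketch of the check above (client code, not part of this
// file): a trailing solidus makes a tag self-closing only when it is not the
// tail of an unquoted attribute value.
//
//	z := html.NewTokenizer(strings.NewReader("<br/><p a=/>"))
//	fmt.Println(z.Next()) // SelfClosingTag
//	fmt.Println(z.Next()) // StartTag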

// readTag reads the next tag token and its attributes. If saveAttr, those
// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
// The opening "<a" or "</a" has already been consumed, where 'a' means anything
// in [A-Za-z].
func (z *Tokenizer) readTag(saveAttr bool) {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	// Read the tag name and attribute key/value pairs.
	z.readTagName()
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			break
		}
		z.raw.end--
		z.readTagAttrKey()
		z.readTagAttrVal()
		// Save pendingAttr if saveAttr and that attribute has a non-empty key.
		if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
			z.attr = append(z.attr, z.pendingAttr)
		}
		if z.skipWhiteSpace(); z.err != nil {
			break
		}
	}
}

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
	z.data.start = z.raw.end - 1
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			z.data.end = z.raw.end - 1
			return
		case '/', '>':
			z.raw.end--
			z.data.end = z.raw.end
			return
		}
	}
}

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
	z.pendingAttr[0].start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.pendingAttr[0].end = z.raw.end
			return
		}
		switch c {
		case '=':
			if z.pendingAttr[0].start+1 == z.raw.end {
				// WHATWG 13.2.5.32: if we see an equals sign before the attribute
				// name begins, we treat it as a character in the attribute name
				// and continue.
				continue
			}
			fallthrough
		case ' ', '\n', '\r', '\t', '\f', '/', '>':
			// WHATWG 13.2.5.33 Attribute name state.
			// We need to reconsume the char in the after attribute name state to
			// support the / character.
			z.raw.end--
			z.pendingAttr[0].end = z.raw.end
			return
		}
	}
}

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
	z.pendingAttr[1].start = z.raw.end
	z.pendingAttr[1].end = z.raw.end
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	c := z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		// WHATWG 13.2.5.34 After attribute name state:
		// U+002F SOLIDUS (/) - Switch to the self-closing start tag state.
		return
	}
	if c != '=' {
		z.raw.end--
		return
	}
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	quote := z.readByte()
	if z.err != nil {
		return
	}
	switch quote {
	case '>':
		z.raw.end--
		return

	case '\'', '"':
		z.pendingAttr[1].start = z.raw.end
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			if c == quote {
				z.pendingAttr[1].end = z.raw.end - 1
				return
			}
		}

	default:
		z.pendingAttr[1].start = z.raw.end - 1
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			switch c {
			case ' ', '\n', '\r', '\t', '\f':
				z.pendingAttr[1].end = z.raw.end - 1
				return
			case '>':
				z.raw.end--
				z.pendingAttr[1].end = z.raw.end
				return
			}
		}
	}
}
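
// An illustrative sketch of the attribute states above, using TagName and
// TagAttr (defined below); client code, not part of this file. Keys are
// lower-cased, values may be quoted or unquoted, and a leading "=" is folded
// into the attribute name per WHATWG 13.2.5.32.
//
//	z := html.NewTokenizer(strings.NewReader(`<div ID=a B='c d' =e>`))
//	z.Next()
//	name, more := z.TagName()
//	fmt.Println(string(name)) // div
//	for more {
//		var k, v []byte
//		k, v, more = z.TagAttr()
//		fmt.Printf("%s=%q\n", k, v) // id="a", then b="c d", then =e=""
//	}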

// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
	z.raw.start = z.raw.end
	z.data.start = z.raw.end
	z.data.end = z.raw.end
	if z.err != nil {
		z.tt = ErrorToken
		return z.tt
	}
	if z.rawTag != "" {
		if z.rawTag == "plaintext" {
			// Read everything up to EOF.
			for z.err == nil {
				z.readByte()
			}
			z.data.end = z.raw.end
			z.textIsRaw = true
		} else {
			z.readRawOrRCDATA()
		}
		if z.data.end > z.data.start {
			z.tt = TextToken
			z.convertNUL = true
			return z.tt
		}
	}
	z.textIsRaw = false
	z.convertNUL = false

loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}

		// Check if the '<' we have just read is part of a tag, comment
		// or doctype. If not, it's part of the accumulated text token.
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		var tokenType TokenType
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			tokenType = StartTagToken
		case c == '/':
			tokenType = EndTagToken
		case c == '!' || c == '?':
			// We use CommentToken to mean any of "<!--actual comments-->",
			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
			tokenType = CommentToken
		default:
			// Reconsume the current character.
			z.raw.end--
			continue
		}

		// We have a non-text token, but we might have accumulated some text
		// before that. If so, we return the text first, and return the non-
		// text token on the subsequent call to Next.
		if x := z.raw.end - len("<a"); z.raw.start < x {
			z.raw.end = x
			z.data.end = x
			z.tt = TextToken
			return z.tt
		}
		switch tokenType {
		case StartTagToken:
			z.tt = z.readStartTag()
			return z.tt
		case EndTagToken:
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c == '>' {
				// "</>" does not generate a token at all. Generate an empty comment
				// to allow passthrough clients to pick up the data using Raw.
				// Reset the tokenizer state and start again.
				z.tt = CommentToken
				return z.tt
			}
			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				z.readTag(false)
				if z.err != nil {
					z.tt = ErrorToken
				} else {
					z.tt = EndTagToken
				}
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		case CommentToken:
			if c == '!' {
				z.tt = z.readMarkupDeclaration()
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		}
	}
	if z.raw.start < z.raw.end {
		z.data.end = z.raw.end
		z.tt = TextToken
		return z.tt
	}
	z.tt = ErrorToken
	return z.tt
}

// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
//
// The token stream's raw bytes partition the byte stream (up until an
// ErrorToken). There are no overlaps or gaps between two consecutive tokens'
// raw bytes. One implication is that the byte offset of the current token is
// the sum of the lengths of all previous tokens' raw bytes.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}
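
// An illustrative sketch of the partition property described above (client
// code, not part of this file): concatenating every token's Raw bytes
// reproduces the input exactly, so byte offsets can be tracked by summing
// len(z.Raw()).
//
//	const in = "<a href=x>hi</a><!--c-->"
//	z := html.NewTokenizer(strings.NewReader(in))
//	var out []byte
//	for z.Next() != html.ErrorToken {
//		out = append(out, z.Raw()...) // append copies before Next invalidates the slice
//	}
//	fmt.Println(string(out) == in) // true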

// convertNewlines converts "\r" and "\r\n" in s to "\n".
// The conversion happens in place, but the resulting slice may be shorter.
func convertNewlines(s []byte) []byte {
	for i, c := range s {
		if c != '\r' {
			continue
		}

		src := i + 1
		if src >= len(s) || s[src] != '\n' {
			s[i] = '\n'
			continue
		}

		dst := i
		for src < len(s) {
			if s[src] == '\r' {
				if src+1 < len(s) && s[src+1] == '\n' {
					src++
				}
				s[dst] = '\n'
			} else {
				s[dst] = s[src]
			}
			src++
			dst++
		}
		return s[:dst]
	}
	return s
}

var (
	nul         = []byte("\x00")
	replacement = []byte("\ufffd")
)

// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		s := z.buf[z.data.start:z.data.end]
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		s = convertNewlines(s)
		if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
			s = bytes.Replace(s, nul, replacement, -1)
		}
		if !z.textIsRaw {
			s = unescape(s, false)
		}
		return s
	}
	return nil
}

// TagName returns the lower-cased name of a tag token (the `img` out of
// `<IMG SRC="foo">`) and whether the tag has attributes.
// The contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
	if z.data.start < z.data.end {
		switch z.tt {
		case StartTagToken, EndTagToken, SelfClosingTagToken:
			s := z.buf[z.data.start:z.data.end]
			z.data.start = z.raw.end
			z.data.end = z.raw.end
			return lower(s), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, false
}

// TagAttr returns the lower-cased key and unescaped value of the next unparsed
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
	if z.nAttrReturned < len(z.attr) {
		switch z.tt {
		case StartTagToken, SelfClosingTagToken:
			x := z.attr[z.nAttrReturned]
			z.nAttrReturned++
			key = z.buf[x[0].start:x[0].end]
			val = z.buf[x[1].start:x[1].end]
			return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
		}
	}
	return nil, nil, false
}
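
// An illustrative sketch (client code, not part of this file): Text, TagName
// and TagAttr return slices into the tokenizer's buffer, so data that must
// outlive the next Next call has to be copied, unlike the self-contained
// Token value built below.
//
//	z := html.NewTokenizer(strings.NewReader("<p>one</p><p>two</p>"))
//	var texts [][]byte
//	for {
//		tt := z.Next()
//		if tt == html.ErrorToken {
//			break
//		}
//		if tt == html.TextToken {
//			texts = append(texts, append([]byte(nil), z.Text()...)) // copy: the buffer is reused
//		}
//	}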

// Token returns the current Token. The result's Data and Attr values remain
// valid after subsequent Next calls.
func (z *Tokenizer) Token() Token {
	t := Token{Type: z.tt}
	switch z.tt {
	case TextToken, CommentToken, DoctypeToken:
		t.Data = string(z.Text())
	case StartTagToken, SelfClosingTagToken, EndTagToken:
		name, moreAttr := z.TagName()
		for moreAttr {
			var key, val []byte
			key, val, moreAttr = z.TagAttr()
			t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
		}
		if a := atom.Lookup(name); a != 0 {
			t.DataAtom, t.Data = a, a.String()
		} else {
			t.DataAtom, t.Data = 0, string(name)
		}
	}
	return t
}

// SetMaxBuf sets a limit on the amount of data buffered during tokenization.
// A value of 0 means unlimited.
func (z *Tokenizer) SetMaxBuf(n int) {
	z.maxBuf = n
}

// NewTokenizer returns a new HTML Tokenizer for the given Reader.
// The input is assumed to be UTF-8 encoded.
func NewTokenizer(r io.Reader) *Tokenizer {
	return NewTokenizerFragment(r, "")
}

// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
// tokenizing an existing element's InnerHTML fragment. contextTag is that
// element's tag, such as "div" or "iframe".
//
// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
// for a <p> tag or a <script> tag.
//
// The input is assumed to be UTF-8 encoded.
func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
	z := &Tokenizer{
		r:   r,
		buf: make([]byte, 0, 4096),
	}
	if contextTag != "" {
		switch s := strings.ToLower(contextTag); s {
		case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
			z.rawTag = s
		}
	}
	return z
}
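
// An illustrative sketch of the fragment behavior above (client code, not
// part of this file): the same bytes "a<b" are one raw text token in a
// script context, but a text token "a" followed by a truncated-tag error in
// a p context.
//
//	z := html.NewTokenizerFragment(strings.NewReader("a<b"), "script")
//	z.Next()
//	fmt.Printf("%q\n", z.Text()) // "a<b"
//
//	z = html.NewTokenizerFragment(strings.NewReader("a<b"), "p")
//	z.Next()
//	fmt.Printf("%q\n", z.Text()) // "a"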