Path: blob/main/vendor/golang.org/x/net/html/parse.go
2880 views
// Copyright 2010 The Go Authors. All rights reserved.1// Use of this source code is governed by a BSD-style2// license that can be found in the LICENSE file.34package html56import (7"errors"8"fmt"9"io"10"strings"1112a "golang.org/x/net/html/atom"13)1415// A parser implements the HTML5 parsing algorithm:16// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction17type parser struct {18// tokenizer provides the tokens for the parser.19tokenizer *Tokenizer20// tok is the most recently read token.21tok Token22// Self-closing tags like <hr/> are treated as start tags, except that23// hasSelfClosingToken is set while they are being processed.24hasSelfClosingToken bool25// doc is the document root element.26doc *Node27// The stack of open elements (section 12.2.4.2) and active formatting28// elements (section 12.2.4.3).29oe, afe nodeStack30// Element pointers (section 12.2.4.4).31head, form *Node32// Other parsing state flags (section 12.2.4.5).33scripting, framesetOK bool34// The stack of template insertion modes35templateStack insertionModeStack36// im is the current insertion mode.37im insertionMode38// originalIM is the insertion mode to go back to after completing a text39// or inTableText insertion mode.40originalIM insertionMode41// fosterParenting is whether new elements should be inserted according to42// the foster parenting rules (section 12.2.6.1).43fosterParenting bool44// quirks is whether the parser is operating in "quirks mode."45quirks bool46// fragment is whether the parser is parsing an HTML fragment.47fragment bool48// context is the context element when parsing an HTML fragment49// (section 12.4).50context *Node51}5253func (p *parser) top() *Node {54if n := p.oe.top(); n != nil {55return n56}57return p.doc58}5960// Stop tags for use in popUntil. 
These come from section 12.2.4.2.61var (62defaultScopeStopTags = map[string][]a.Atom{63"": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},64"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},65"svg": {a.Desc, a.ForeignObject, a.Title},66}67)6869type scope int7071const (72defaultScope scope = iota73listItemScope74buttonScope75tableScope76tableRowScope77tableBodyScope78selectScope79)8081// popUntil pops the stack of open elements at the highest element whose tag82// is in matchTags, provided there is no higher element in the scope's stop83// tags (as defined in section 12.2.4.2). It returns whether or not there was84// such an element. If there was not, popUntil leaves the stack unchanged.85//86// For example, the set of stop tags for table scope is: "html", "table". If87// the stack was:88// ["html", "body", "font", "table", "b", "i", "u"]89// then popUntil(tableScope, "font") would return false, but90// popUntil(tableScope, "i") would return true and the stack would become:91// ["html", "body", "font", "table", "b"]92//93// If an element's tag is in both the stop tags and matchTags, then the stack94// will be popped and the function returns true (provided, of course, there was95// no higher element in the stack that was also in the stop tags). For example,96// popUntil(tableScope, "table") returns true and leaves:97// ["html", "body", "font"]98func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {99if i := p.indexOfElementInScope(s, matchTags...); i != -1 {100p.oe = p.oe[:i]101return true102}103return false104}105106// indexOfElementInScope returns the index in p.oe of the highest element whose107// tag is in matchTags that is in scope. 
If no matching element is in scope, it108// returns -1.109func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {110for i := len(p.oe) - 1; i >= 0; i-- {111tagAtom := p.oe[i].DataAtom112if p.oe[i].Namespace == "" {113for _, t := range matchTags {114if t == tagAtom {115return i116}117}118switch s {119case defaultScope:120// No-op.121case listItemScope:122if tagAtom == a.Ol || tagAtom == a.Ul {123return -1124}125case buttonScope:126if tagAtom == a.Button {127return -1128}129case tableScope:130if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {131return -1132}133case selectScope:134if tagAtom != a.Optgroup && tagAtom != a.Option {135return -1136}137default:138panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s))139}140}141switch s {142case defaultScope, listItemScope, buttonScope:143for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {144if t == tagAtom {145return -1146}147}148}149}150return -1151}152153// elementInScope is like popUntil, except that it doesn't modify the stack of154// open elements.155func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {156return p.indexOfElementInScope(s, matchTags...) 
!= -1157}158159// clearStackToContext pops elements off the stack of open elements until a160// scope-defined element is found.161func (p *parser) clearStackToContext(s scope) {162for i := len(p.oe) - 1; i >= 0; i-- {163tagAtom := p.oe[i].DataAtom164switch s {165case tableScope:166if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {167p.oe = p.oe[:i+1]168return169}170case tableRowScope:171if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {172p.oe = p.oe[:i+1]173return174}175case tableBodyScope:176if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {177p.oe = p.oe[:i+1]178return179}180default:181panic(fmt.Sprintf("html: internal error: clearStackToContext unknown scope: %d", s))182}183}184}185186// parseGenericRawTextElement implements the generic raw text element parsing187// algorithm defined in 12.2.6.2.188// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text189// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part190// officially, need to make tokenizer consider both states.191func (p *parser) parseGenericRawTextElement() {192p.addElement()193p.originalIM = p.im194p.im = textIM195}196197// generateImpliedEndTags pops nodes off the stack of open elements as long as198// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.199// If exceptions are specified, nodes with that name will not be popped off.200func (p *parser) generateImpliedEndTags(exceptions ...string) {201var i int202loop:203for i = len(p.oe) - 1; i >= 0; i-- {204n := p.oe[i]205if n.Type != ElementNode {206break207}208switch n.DataAtom {209case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:210for _, except := range exceptions {211if n.Data == except {212break loop213}214}215continue216}217break218}219220p.oe = p.oe[:i+1]221}222223// addChild adds a child node n to the top element, and pushes n 
// onto the stack
// of open elements if it is an element node.
func (p *parser) addChild(n *Node) {
	if p.shouldFosterParent() {
		p.fosterParent(n)
	} else {
		p.top().AppendChild(n)
	}

	if n.Type == ElementNode {
		p.insertOpenElement(n)
	}
}

// insertOpenElement pushes n onto the stack of open elements. It panics if
// the stack then holds more than 512 nodes.
func (p *parser) insertOpenElement(n *Node) {
	p.oe = append(p.oe, n)
	if len(p.oe) > 512 {
		panic("html: open stack of elements exceeds 512 nodes")
	}
}

// shouldFosterParent returns whether the next node to be added should be
// foster parented.
func (p *parser) shouldFosterParent() bool {
	if p.fosterParenting {
		switch p.top().DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			return true
		}
	}
	return false
}

// fosterParent adds a child node according to the foster parenting rules.
// Section 12.2.6.1, "foster parenting".
func (p *parser) fosterParent(n *Node) {
	var table, parent, prev, template *Node
	// Find the highest table element on the stack of open elements; i is left
	// at its index (or -1 if there is none).
	var i int
	for i = len(p.oe) - 1; i >= 0; i-- {
		if p.oe[i].DataAtom == a.Table {
			table = p.oe[i]
			break
		}
	}

	// Likewise find the highest template element; j is left at its index.
	var j int
	for j = len(p.oe) - 1; j >= 0; j-- {
		if p.oe[j].DataAtom == a.Template {
			template = p.oe[j]
			break
		}
	}

	// A template that sits above the table (or with no table at all) takes
	// the node directly.
	if template != nil && (table == nil || j > i) {
		template.AppendChild(n)
		return
	}

	if table == nil {
		// The foster parent is the html element.
		parent = p.oe[0]
	} else {
		parent = table.Parent
	}
	if parent == nil {
		// The table has no parent: use the element just below it on the stack.
		parent = p.oe[i-1]
	}

	if table != nil {
		prev = table.PrevSibling
	} else {
		prev = parent.LastChild
	}
	// Merge adjacent text nodes rather than inserting a second one.
	if prev != nil && prev.Type == TextNode && n.Type == TextNode {
		prev.Data += n.Data
		return
	}

	parent.InsertBefore(n, table)
}

// addText adds text to the preceding node if it is a text node, or else it
// calls addChild with a new text node.
func (p *parser) addText(text string) {
	if text == "" {
		return
	}

	if p.shouldFosterParent() {
		p.fosterParent(&Node{
			Type: TextNode,
			Data: text,
		})
		return
	}

	t := p.top()
	if n := t.LastChild; n != nil && n.Type == TextNode {
		n.Data += text
		return
	}
	p.addChild(&Node{
		Type: TextNode,
		Data: text,
	})
}

// addElement adds a child element based on the current token.
func (p *parser) addElement() {
	p.addChild(&Node{
		Type:     ElementNode,
		DataAtom: p.tok.DataAtom,
		Data:     p.tok.Data,
		Attr:     p.tok.Attr,
	})
}

// addFormattingElement adds an element for the current token and appends it
// to the list of active formatting elements, after pruning earlier identical
// entries as described below.
// Section 12.2.4.3.
func (p *parser) addFormattingElement() {
	tagAtom, attr := p.tok.DataAtom, p.tok.Attr
	p.addElement()

	// Implement the Noah's Ark clause, but with three per family instead of two.
	identicalElements := 0
findIdenticalElements:
	for i := len(p.afe) - 1; i >= 0; i-- {
		n := p.afe[i]
		if n.Type == scopeMarkerNode {
			break
		}
		if n.Type != ElementNode {
			continue
		}
		if n.Namespace != "" {
			continue
		}
		if n.DataAtom != tagAtom {
			continue
		}
		if len(n.Attr) != len(attr) {
			continue
		}
	compareAttributes:
		for _, t0 := range n.Attr {
			for _, t1 := range attr {
				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
					// Found a match for this attribute, continue with the next attribute.
					continue compareAttributes
				}
			}
			// If we get here, there is no attribute that matches a.
			// Therefore the element is not identical to the new one.
			continue findIdenticalElements
		}

		identicalElements++
		if identicalElements >= 3 {
			p.afe.remove(n)
		}
	}

	p.afe = append(p.afe, p.top())
}

// clearActiveFormattingElements pops entries off the list of active
// formatting elements until the list is empty or a scope marker has been
// popped.
// Section 12.2.4.3.
func (p *parser) clearActiveFormattingElements() {
	for {
		if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
			return
		}
	}
}

// reconstructActiveFormattingElements re-adds, as clones, the trailing
// entries of the list of active formatting elements that are not on the stack
// of open elements, replacing each list entry with its clone.
// Section 12.2.4.3.
func (p *parser) reconstructActiveFormattingElements() {
	n := p.afe.top()
	if n == nil {
		return
	}
	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
		return
	}
	// Rewind i to the last entry that is a scope marker or is already open
	// (or to -1 if there is no such entry).
	i := len(p.afe) - 1
	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
		if i == 0 {
			i = -1
			break
		}
		i--
		n = p.afe[i]
	}
	// Advance from there to the end of the list, cloning each entry.
	for {
		i++
		clone := p.afe[i].clone()
		p.addChild(clone)
		p.afe[i] = clone
		if i == len(p.afe)-1 {
			break
		}
	}
}

// acknowledgeSelfClosingTag clears the hasSelfClosingToken flag.
// Section 12.2.5.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}

// An insertion mode (section 12.2.4.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed.
type insertionMode func(*parser) bool

// setOriginalIM sets the insertion mode to return to after completing a text or
// inTableText insertion mode.
// Section 12.2.4.1, "using the rules for".
// It panics if originalIM is already set.
func (p *parser) setOriginalIM() {
	if p.originalIM != nil {
		panic("html: bad parser state: originalIM was set twice")
	}
	p.originalIM = p.im
}

// resetInsertionMode picks the insertion mode from the stack of open
// elements, walking from the top of the stack downwards. For the bottom entry
// the fragment context element, if any, is examined instead.
// Section 12.2.4.1, "reset the insertion mode".
func (p *parser) resetInsertionMode() {
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		last := i == 0
		if last && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			if !last {
				// Look below the select for an enclosing template or table.
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		return
	}
}

// whitespace is the cutset the insertion modes pass to strings.TrimLeft to
// strip leading whitespace from text tokens.
const whitespace = " \t\r\n\f"

// Section 12.2.6.4.1.
func initialIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		n, quirks := parseDoctype(p.tok.Data)
		p.doc.AppendChild(n)
		p.quirks = quirks
		p.im = beforeHTMLIM
		return true
	}
	// Any other token: switch to quirks mode and let beforeHTMLIM reprocess
	// the token.
	p.quirks = true
	p.im = beforeHTMLIM
	return false
}

// Section 12.2.6.4.2.
func beforeHTMLIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		if p.tok.DataAtom == a.Html {
			p.addElement()
			p.im = beforeHeadIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	}
	// Anything else: imply an <html> start tag and reprocess the token.
	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
	return false
}

// Section 12.2.6.4.3.
func beforeHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.addElement()
			p.head = p.top()
			p.im = inHeadIM
			return true
		case a.Html:
			return inBodyIM(p)
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head, a.Body, a.Html, a.Br:
			p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: imply a <head> start tag and reprocess the token.
	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
	return false
}

// Section 12.2.6.4.4.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// We don't handle all of the corner cases when mixing foreign
			// content (i.e. <math> or <svg>) with <template>. Without this
			// early return, we can get into an infinite loop, possibly because
			// of the "TODO... further divergence" a little below.
			//
			// As a workaround, if we are mixing foreign content and templates,
			// just ignore the rest of the HTML. Foreign content is rare and a
			// relatively old HTML feature. Templates are also rare and a
			// relatively new HTML feature. Their combination is very rare.
			for _, e := range p.oe {
				if e.Namespace != "" {
					p.im = ignoreTheRemainingTokens
					return true
				}
			}

			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			if !p.oe.contains(a.Template) {
				return true
			}
			// TODO: remove this further divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Anything else: imply a </head> end tag and reprocess the token.
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}

// Section 12.2.6.4.5.
func inHeadNoscriptIM(p *parser) bool {
	switch p.tok.Type {
	case DoctypeToken:
		// Ignore the token.
		return true
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		case a.Noscript:
			// Don't let the tokenizer go into raw text mode even when a <noscript>
			// tag is in "in head noscript" insertion mode.
			p.tokenizer.NextIsNotRawText()
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Noscript, a.Br:
		default:
			// Ignore the token.
			return true
		}
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) == 0 {
			// It was all whitespace.
			return inHeadIM(p)
		}
	case CommentToken:
		return inHeadIM(p)
	}
	// Leave the noscript element and go back to the "in head" mode.
	p.oe.pop()
	if p.top().DataAtom != a.Head {
		panic("html: the new current node will be a head element.")
	}
	p.im = inHeadIM
	// A noscript token is consumed here; any other token is reprocessed.
	if p.tok.DataAtom == a.Noscript {
		return true
	}
	return false
}

// Section 12.2.6.4.6.
func afterHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Body:
			p.addElement()
			p.framesetOK = false
			p.im = inBodyIM
			return true
		case a.Frameset:
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			// Temporarily put the head element back on the stack of open
			// elements while the token is handled by inHeadIM.
			p.insertOpenElement(p.head)
			defer p.oe.remove(p.head)
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body, a.Html, a.Br:
			// Drop down to creating an implied <body> tag.
		case a.Template:
			return inHeadIM(p)
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
	p.framesetOK = true
	if p.tok.Type == ErrorToken {
		// Stop parsing.
		return true
	}
	return false
}

// copyAttributes copies attributes of src not found on dst to dst.
func copyAttributes(dst *Node, src Token) {
	if len(src.Attr) == 0 {
		return
	}
	// attr records the keys already present on dst (and those added below).
	attr := map[string]string{}
	for _, t := range dst.Attr {
		attr[t.Key] = t.Val
	}
	for _, t := range src.Attr {
		if _, ok := attr[t.Key]; !ok {
			dst.Attr = append(dst.Attr, t)
			attr[t.Key] = t.Val
		}
	}
}

// Section 12.2.6.4.7.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// Drop NUL characters from the text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Detach the body element and replace it with the frameset.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// A heading directly inside another heading closes the outer one.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// An <a> already in the list of active formatting elements (after
			// the last scope marker) is closed before the new one is added.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.EqualFold(t.Val, "hidden") {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite the token as <img> and have it reprocessed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Iframe:
			p.framesetOK = false
			p.parseGenericRawTextElement()
		case a.Noembed:
			p.parseGenericRawTextElement()
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The new element's namespace is the tag name ("math" or "svg").
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			// A </p> with no <p> in button scope first implies a <p> start tag.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// Turn </br> into a <br> start tag and have it reprocessed.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		}
		for _, e := range p.oe {
			switch e.DataAtom {
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
				a.Thead, a.Tr, a.Body, a.Html:
			default:
				return true
			}
		}
	}

	return true
}

// inBodyEndTagFormatting handles an end tag for a formatting element in the
// "in body" insertion mode. tagAtom and tagName identify the end tag.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-2
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
		p.oe.pop()
		return
	}

	// Steps 3-5. The outer loop.
	for i := 0; i < 8; i++ {
		// Step 6. Find the formatting element.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}

		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		// Step 8. Ignore the tag if formatting element is not in the scope.
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Step 9. This step is omitted because it's just a parse error but no need to return.

		// Steps 10-11. Find the furthest block.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop up to and including the formatting
			// element and drop it from the active formatting elements.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 12-13. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 14. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Step 14.1.
		j := 0
		for {
			// Step 14.2.
			j++
			// Step. 14.3.
			x--
			node = p.oe[x]
			// Step 14.4. Go to the next step if node is formatting element.
			if node == formattingElement {
				break
			}
			// Step 14.5. Remove node from the list of active formatting elements if
			// inner loop counter is greater than three and node is in the list of
			// active formatting elements.
			if ni := p.afe.index(node); j > 3 && ni > -1 {
				p.afe.remove(node)
				// If any element of the list of active formatting elements is removed,
				// we need to take care whether bookmark should be decremented or not.
				// This is because the value of bookmark may exceed the size of the
				// list by removing elements from the list.
				if ni <= bookmark {
					bookmark--
				}
				continue
			}
			// Step 14.6. Continue the next inner loop if node is not in the list of
			// active formatting elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 14.7.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 14.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 14.9.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 14.10.
			lastNode = node
		}

		// Step 15. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 16-18. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 19. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 20. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
	for i := len(p.oe) - 1; i >= 0; i-- {
		// Two element nodes have the same tag if they have the same Data (a
		// string-typed field). As an optimization, for common HTML tags, each
		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
		// field), since integer comparison is faster than string comparison.
		// Uncommon (custom) tags get a zero DataAtom.
		//
		// The if condition here is equivalent to (p.oe[i].Data == tagName).
		if (p.oe[i].DataAtom == tagAtom) &&
			((tagAtom != 0) || (p.oe[i].Data == tagName)) {
			p.oe = p.oe[:i]
			break
		}
		// Stop at the first special element; the stack is left unchanged.
		if isSpecialElement(p.oe[i]) {
			break
		}
	}
}

// Section 12.2.6.4.8.
func textIM(p *parser) bool {
	switch p.tok.Type {
	case ErrorToken:
		p.oe.pop()
	case TextToken:
		d := p.tok.Data
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
			// Ignore a newline at the start of a <textarea> block.
			if d != "" && d[0] == '\r' {
				d = d[1:]
			}
			if d != "" && d[0] == '\n' {
				d = d[1:]
			}
		}
		if d == "" {
			return true
		}
		p.addText(d)
		return true
	case EndTagToken:
		p.oe.pop()
	}
	// Return to the saved insertion mode. Only an end tag counts as consumed
	// here; any other token is reprocessed in the restored mode.
	p.im = p.originalIM
	p.originalIM = nil
	return p.tok.Type == EndTagToken
}

//
Section 12.2.6.4.9.1426func inTableIM(p *parser) bool {1427switch p.tok.Type {1428case TextToken:1429p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)1430switch p.oe.top().DataAtom {1431case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:1432if strings.Trim(p.tok.Data, whitespace) == "" {1433p.addText(p.tok.Data)1434return true1435}1436}1437case StartTagToken:1438switch p.tok.DataAtom {1439case a.Caption:1440p.clearStackToContext(tableScope)1441p.afe = append(p.afe, &scopeMarker)1442p.addElement()1443p.im = inCaptionIM1444return true1445case a.Colgroup:1446p.clearStackToContext(tableScope)1447p.addElement()1448p.im = inColumnGroupIM1449return true1450case a.Col:1451p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())1452return false1453case a.Tbody, a.Tfoot, a.Thead:1454p.clearStackToContext(tableScope)1455p.addElement()1456p.im = inTableBodyIM1457return true1458case a.Td, a.Th, a.Tr:1459p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())1460return false1461case a.Table:1462if p.popUntil(tableScope, a.Table) {1463p.resetInsertionMode()1464return false1465}1466// Ignore the token.1467return true1468case a.Style, a.Script, a.Template:1469return inHeadIM(p)1470case a.Input:1471for _, t := range p.tok.Attr {1472if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {1473p.addElement()1474p.oe.pop()1475return true1476}1477}1478// Otherwise drop down to the default action.1479case a.Form:1480if p.oe.contains(a.Template) || p.form != nil {1481// Ignore the token.1482return true1483}1484p.addElement()1485p.form = p.oe.pop()1486case a.Select:1487p.reconstructActiveFormattingElements()1488switch p.top().DataAtom {1489case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:1490p.fosterParenting = true1491}1492p.addElement()1493p.fosterParenting = false1494p.framesetOK = false1495p.im = inSelectInTableIM1496return true1497}1498case EndTagToken:1499switch p.tok.DataAtom {1500case a.Table:1501if p.popUntil(tableScope, a.Table) 
{1502p.resetInsertionMode()1503return true1504}1505// Ignore the token.1506return true1507case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:1508// Ignore the token.1509return true1510case a.Template:1511return inHeadIM(p)1512}1513case CommentToken:1514p.addChild(&Node{1515Type: CommentNode,1516Data: p.tok.Data,1517})1518return true1519case DoctypeToken:1520// Ignore the token.1521return true1522case ErrorToken:1523return inBodyIM(p)1524}15251526p.fosterParenting = true1527defer func() { p.fosterParenting = false }()15281529return inBodyIM(p)1530}15311532// Section 12.2.6.4.11.1533func inCaptionIM(p *parser) bool {1534switch p.tok.Type {1535case StartTagToken:1536switch p.tok.DataAtom {1537case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:1538if !p.popUntil(tableScope, a.Caption) {1539// Ignore the token.1540return true1541}1542p.clearActiveFormattingElements()1543p.im = inTableIM1544return false1545case a.Select:1546p.reconstructActiveFormattingElements()1547p.addElement()1548p.framesetOK = false1549p.im = inSelectInTableIM1550return true1551}1552case EndTagToken:1553switch p.tok.DataAtom {1554case a.Caption:1555if p.popUntil(tableScope, a.Caption) {1556p.clearActiveFormattingElements()1557p.im = inTableIM1558}1559return true1560case a.Table:1561if !p.popUntil(tableScope, a.Caption) {1562// Ignore the token.1563return true1564}1565p.clearActiveFormattingElements()1566p.im = inTableIM1567return false1568case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:1569// Ignore the token.1570return true1571}1572}1573return inBodyIM(p)1574}15751576// Section 12.2.6.4.12.1577func inColumnGroupIM(p *parser) bool {1578switch p.tok.Type {1579case TextToken:1580s := strings.TrimLeft(p.tok.Data, whitespace)1581if len(s) < len(p.tok.Data) {1582// Add the initial whitespace to the current node.1583p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])1584if s == "" {1585return 
true1586}1587p.tok.Data = s1588}1589case CommentToken:1590p.addChild(&Node{1591Type: CommentNode,1592Data: p.tok.Data,1593})1594return true1595case DoctypeToken:1596// Ignore the token.1597return true1598case StartTagToken:1599switch p.tok.DataAtom {1600case a.Html:1601return inBodyIM(p)1602case a.Col:1603p.addElement()1604p.oe.pop()1605p.acknowledgeSelfClosingTag()1606return true1607case a.Template:1608return inHeadIM(p)1609}1610case EndTagToken:1611switch p.tok.DataAtom {1612case a.Colgroup:1613if p.oe.top().DataAtom == a.Colgroup {1614p.oe.pop()1615p.im = inTableIM1616}1617return true1618case a.Col:1619// Ignore the token.1620return true1621case a.Template:1622return inHeadIM(p)1623}1624case ErrorToken:1625return inBodyIM(p)1626}1627if p.oe.top().DataAtom != a.Colgroup {1628return true1629}1630p.oe.pop()1631p.im = inTableIM1632return false1633}16341635// Section 12.2.6.4.13.1636func inTableBodyIM(p *parser) bool {1637switch p.tok.Type {1638case StartTagToken:1639switch p.tok.DataAtom {1640case a.Tr:1641p.clearStackToContext(tableBodyScope)1642p.addElement()1643p.im = inRowIM1644return true1645case a.Td, a.Th:1646p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())1647return false1648case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:1649if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {1650p.im = inTableIM1651return false1652}1653// Ignore the token.1654return true1655}1656case EndTagToken:1657switch p.tok.DataAtom {1658case a.Tbody, a.Tfoot, a.Thead:1659if p.elementInScope(tableScope, p.tok.DataAtom) {1660p.clearStackToContext(tableBodyScope)1661p.oe.pop()1662p.im = inTableIM1663}1664return true1665case a.Table:1666if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {1667p.im = inTableIM1668return false1669}1670// Ignore the token.1671return true1672case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:1673// Ignore the token.1674return true1675}1676case CommentToken:1677p.addChild(&Node{1678Type: CommentNode,1679Data: 
p.tok.Data,1680})1681return true1682}16831684return inTableIM(p)1685}16861687// Section 13.2.6.4.14.1688func inRowIM(p *parser) bool {1689switch p.tok.Type {1690case StartTagToken:1691switch p.tok.DataAtom {1692case a.Td, a.Th:1693p.clearStackToContext(tableRowScope)1694p.addElement()1695p.afe = append(p.afe, &scopeMarker)1696p.im = inCellIM1697return true1698case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:1699if p.elementInScope(tableScope, a.Tr) {1700p.clearStackToContext(tableRowScope)1701p.oe.pop()1702p.im = inTableBodyIM1703return false1704}1705// Ignore the token.1706return true1707}1708case EndTagToken:1709switch p.tok.DataAtom {1710case a.Tr:1711if p.elementInScope(tableScope, a.Tr) {1712p.clearStackToContext(tableRowScope)1713p.oe.pop()1714p.im = inTableBodyIM1715return true1716}1717// Ignore the token.1718return true1719case a.Table:1720if p.elementInScope(tableScope, a.Tr) {1721p.clearStackToContext(tableRowScope)1722p.oe.pop()1723p.im = inTableBodyIM1724return false1725}1726// Ignore the token.1727return true1728case a.Tbody, a.Tfoot, a.Thead:1729if p.elementInScope(tableScope, p.tok.DataAtom) && p.elementInScope(tableScope, a.Tr) {1730p.clearStackToContext(tableRowScope)1731p.oe.pop()1732p.im = inTableBodyIM1733return false1734}1735// Ignore the token.1736return true1737case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:1738// Ignore the token.1739return true1740}1741}17421743return inTableIM(p)1744}17451746// Section 12.2.6.4.15.1747func inCellIM(p *parser) bool {1748switch p.tok.Type {1749case StartTagToken:1750switch p.tok.DataAtom {1751case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:1752if p.popUntil(tableScope, a.Td, a.Th) {1753// Close the cell and reprocess.1754p.clearActiveFormattingElements()1755p.im = inRowIM1756return false1757}1758// Ignore the token.1759return true1760case a.Select:1761p.reconstructActiveFormattingElements()1762p.addElement()1763p.framesetOK = false1764p.im = 
inSelectInTableIM1765return true1766}1767case EndTagToken:1768switch p.tok.DataAtom {1769case a.Td, a.Th:1770if !p.popUntil(tableScope, p.tok.DataAtom) {1771// Ignore the token.1772return true1773}1774p.clearActiveFormattingElements()1775p.im = inRowIM1776return true1777case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:1778// Ignore the token.1779return true1780case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:1781if !p.elementInScope(tableScope, p.tok.DataAtom) {1782// Ignore the token.1783return true1784}1785// Close the cell and reprocess.1786if p.popUntil(tableScope, a.Td, a.Th) {1787p.clearActiveFormattingElements()1788}1789p.im = inRowIM1790return false1791}1792}1793return inBodyIM(p)1794}17951796// Section 12.2.6.4.16.1797func inSelectIM(p *parser) bool {1798switch p.tok.Type {1799case TextToken:1800p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))1801case StartTagToken:1802switch p.tok.DataAtom {1803case a.Html:1804return inBodyIM(p)1805case a.Option:1806if p.top().DataAtom == a.Option {1807p.oe.pop()1808}1809p.addElement()1810case a.Optgroup:1811if p.top().DataAtom == a.Option {1812p.oe.pop()1813}1814if p.top().DataAtom == a.Optgroup {1815p.oe.pop()1816}1817p.addElement()1818case a.Select:1819if !p.popUntil(selectScope, a.Select) {1820// Ignore the token.1821return true1822}1823p.resetInsertionMode()1824case a.Input, a.Keygen, a.Textarea:1825if p.elementInScope(selectScope, a.Select) {1826p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())1827return false1828}1829// In order to properly ignore <textarea>, we need to change the tokenizer mode.1830p.tokenizer.NextIsNotRawText()1831// Ignore the token.1832return true1833case a.Script, a.Template:1834return inHeadIM(p)1835case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:1836// Don't let the tokenizer go into raw text mode when there are raw tags1837// to be ignored. 
These tags should be ignored from the tokenizer1838// properly.1839p.tokenizer.NextIsNotRawText()1840// Ignore the token.1841return true1842}1843case EndTagToken:1844switch p.tok.DataAtom {1845case a.Option:1846if p.top().DataAtom == a.Option {1847p.oe.pop()1848}1849case a.Optgroup:1850i := len(p.oe) - 11851if p.oe[i].DataAtom == a.Option {1852i--1853}1854if p.oe[i].DataAtom == a.Optgroup {1855p.oe = p.oe[:i]1856}1857case a.Select:1858if !p.popUntil(selectScope, a.Select) {1859// Ignore the token.1860return true1861}1862p.resetInsertionMode()1863case a.Template:1864return inHeadIM(p)1865}1866case CommentToken:1867p.addChild(&Node{1868Type: CommentNode,1869Data: p.tok.Data,1870})1871case DoctypeToken:1872// Ignore the token.1873return true1874case ErrorToken:1875return inBodyIM(p)1876}18771878return true1879}18801881// Section 12.2.6.4.17.1882func inSelectInTableIM(p *parser) bool {1883switch p.tok.Type {1884case StartTagToken, EndTagToken:1885switch p.tok.DataAtom {1886case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:1887if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {1888// Ignore the token.1889return true1890}1891// This is like p.popUntil(selectScope, a.Select), but it also1892// matches <math select>, not just <select>. 
Matching the MathML1893// tag is arguably incorrect (conceptually), but it mimics what1894// Chromium does.1895for i := len(p.oe) - 1; i >= 0; i-- {1896if n := p.oe[i]; n.DataAtom == a.Select {1897p.oe = p.oe[:i]1898break1899}1900}1901p.resetInsertionMode()1902return false1903}1904}1905return inSelectIM(p)1906}19071908// Section 12.2.6.4.18.1909func inTemplateIM(p *parser) bool {1910switch p.tok.Type {1911case TextToken, CommentToken, DoctypeToken:1912return inBodyIM(p)1913case StartTagToken:1914switch p.tok.DataAtom {1915case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:1916return inHeadIM(p)1917case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:1918p.templateStack.pop()1919p.templateStack = append(p.templateStack, inTableIM)1920p.im = inTableIM1921return false1922case a.Col:1923p.templateStack.pop()1924p.templateStack = append(p.templateStack, inColumnGroupIM)1925p.im = inColumnGroupIM1926return false1927case a.Tr:1928p.templateStack.pop()1929p.templateStack = append(p.templateStack, inTableBodyIM)1930p.im = inTableBodyIM1931return false1932case a.Td, a.Th:1933p.templateStack.pop()1934p.templateStack = append(p.templateStack, inRowIM)1935p.im = inRowIM1936return false1937default:1938p.templateStack.pop()1939p.templateStack = append(p.templateStack, inBodyIM)1940p.im = inBodyIM1941return false1942}1943case EndTagToken:1944switch p.tok.DataAtom {1945case a.Template:1946return inHeadIM(p)1947default:1948// Ignore the token.1949return true1950}1951case ErrorToken:1952if !p.oe.contains(a.Template) {1953// Ignore the token.1954return true1955}1956// TODO: remove this divergence from the HTML5 spec.1957//1958// See https://bugs.chromium.org/p/chromium/issues/detail?id=8296681959p.generateImpliedEndTags()1960for i := len(p.oe) - 1; i >= 0; i-- {1961if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {1962p.oe = 
p.oe[:i]1963break1964}1965}1966p.clearActiveFormattingElements()1967p.templateStack.pop()1968p.resetInsertionMode()1969return false1970}1971return false1972}19731974// Section 12.2.6.4.19.1975func afterBodyIM(p *parser) bool {1976switch p.tok.Type {1977case ErrorToken:1978// Stop parsing.1979return true1980case TextToken:1981s := strings.TrimLeft(p.tok.Data, whitespace)1982if len(s) == 0 {1983// It was all whitespace.1984return inBodyIM(p)1985}1986case StartTagToken:1987if p.tok.DataAtom == a.Html {1988return inBodyIM(p)1989}1990case EndTagToken:1991if p.tok.DataAtom == a.Html {1992if !p.fragment {1993p.im = afterAfterBodyIM1994}1995return true1996}1997case CommentToken:1998// The comment is attached to the <html> element.1999if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {2000panic("html: bad parser state: <html> element not found, in the after-body insertion mode")2001}2002p.oe[0].AppendChild(&Node{2003Type: CommentNode,2004Data: p.tok.Data,2005})2006return true2007}2008p.im = inBodyIM2009return false2010}20112012// Section 12.2.6.4.20.2013func inFramesetIM(p *parser) bool {2014switch p.tok.Type {2015case CommentToken:2016p.addChild(&Node{2017Type: CommentNode,2018Data: p.tok.Data,2019})2020case TextToken:2021// Ignore all text but whitespace.2022s := strings.Map(func(c rune) rune {2023switch c {2024case ' ', '\t', '\n', '\f', '\r':2025return c2026}2027return -12028}, p.tok.Data)2029if s != "" {2030p.addText(s)2031}2032case StartTagToken:2033switch p.tok.DataAtom {2034case a.Html:2035return inBodyIM(p)2036case a.Frameset:2037p.addElement()2038case a.Frame:2039p.addElement()2040p.oe.pop()2041p.acknowledgeSelfClosingTag()2042case a.Noframes:2043return inHeadIM(p)2044}2045case EndTagToken:2046switch p.tok.DataAtom {2047case a.Frameset:2048if p.oe.top().DataAtom != a.Html {2049p.oe.pop()2050if p.oe.top().DataAtom != a.Frameset {2051p.im = afterFramesetIM2052return true2053}2054}2055}2056default:2057// Ignore the token.2058}2059return true2060}20612062// Section 
12.2.6.4.21.2063func afterFramesetIM(p *parser) bool {2064switch p.tok.Type {2065case CommentToken:2066p.addChild(&Node{2067Type: CommentNode,2068Data: p.tok.Data,2069})2070case TextToken:2071// Ignore all text but whitespace.2072s := strings.Map(func(c rune) rune {2073switch c {2074case ' ', '\t', '\n', '\f', '\r':2075return c2076}2077return -12078}, p.tok.Data)2079if s != "" {2080p.addText(s)2081}2082case StartTagToken:2083switch p.tok.DataAtom {2084case a.Html:2085return inBodyIM(p)2086case a.Noframes:2087return inHeadIM(p)2088}2089case EndTagToken:2090switch p.tok.DataAtom {2091case a.Html:2092p.im = afterAfterFramesetIM2093return true2094}2095default:2096// Ignore the token.2097}2098return true2099}21002101// Section 12.2.6.4.22.2102func afterAfterBodyIM(p *parser) bool {2103switch p.tok.Type {2104case ErrorToken:2105// Stop parsing.2106return true2107case TextToken:2108s := strings.TrimLeft(p.tok.Data, whitespace)2109if len(s) == 0 {2110// It was all whitespace.2111return inBodyIM(p)2112}2113case StartTagToken:2114if p.tok.DataAtom == a.Html {2115return inBodyIM(p)2116}2117case CommentToken:2118p.doc.AppendChild(&Node{2119Type: CommentNode,2120Data: p.tok.Data,2121})2122return true2123case DoctypeToken:2124return inBodyIM(p)2125}2126p.im = inBodyIM2127return false2128}21292130// Section 12.2.6.4.23.2131func afterAfterFramesetIM(p *parser) bool {2132switch p.tok.Type {2133case CommentToken:2134p.doc.AppendChild(&Node{2135Type: CommentNode,2136Data: p.tok.Data,2137})2138case TextToken:2139// Ignore all text but whitespace.2140s := strings.Map(func(c rune) rune {2141switch c {2142case ' ', '\t', '\n', '\f', '\r':2143return c2144}2145return -12146}, p.tok.Data)2147if s != "" {2148p.tok.Data = s2149return inBodyIM(p)2150}2151case StartTagToken:2152switch p.tok.DataAtom {2153case a.Html:2154return inBodyIM(p)2155case a.Noframes:2156return inHeadIM(p)2157}2158case DoctypeToken:2159return inBodyIM(p)2160default:2161// Ignore the token.2162}2163return 
true2164}21652166func ignoreTheRemainingTokens(p *parser) bool {2167return true2168}21692170const whitespaceOrNUL = whitespace + "\x00"21712172// Section 12.2.6.52173func parseForeignContent(p *parser) bool {2174switch p.tok.Type {2175case TextToken:2176if p.framesetOK {2177p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""2178}2179p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)2180p.addText(p.tok.Data)2181case CommentToken:2182p.addChild(&Node{2183Type: CommentNode,2184Data: p.tok.Data,2185})2186case StartTagToken:2187if !p.fragment {2188b := breakout[p.tok.Data]2189if p.tok.DataAtom == a.Font {2190loop:2191for _, attr := range p.tok.Attr {2192switch attr.Key {2193case "color", "face", "size":2194b = true2195break loop2196}2197}2198}2199if b {2200for i := len(p.oe) - 1; i >= 0; i-- {2201n := p.oe[i]2202if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {2203p.oe = p.oe[:i+1]2204break2205}2206}2207return false2208}2209}2210current := p.adjustedCurrentNode()2211switch current.Namespace {2212case "math":2213adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)2214case "svg":2215// Adjust SVG tag names. The tokenizer lower-cases tag names, but2216// SVG wants e.g. "foreignObject" with a capital second "O".2217if x := svgTagNameAdjustments[p.tok.Data]; x != "" {2218p.tok.DataAtom = a.Lookup([]byte(x))2219p.tok.Data = x2220}2221adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)2222default:2223panic("html: bad parser state: unexpected namespace")2224}2225adjustForeignAttributes(p.tok.Attr)2226namespace := current.Namespace2227p.addElement()2228p.top().Namespace = namespace2229if namespace != "" {2230// Don't let the tokenizer go into raw text mode in foreign content2231// (e.g. 
in an SVG <title> tag).2232p.tokenizer.NextIsNotRawText()2233}2234if p.hasSelfClosingToken {2235p.oe.pop()2236p.acknowledgeSelfClosingTag()2237}2238case EndTagToken:2239if strings.EqualFold(p.oe[len(p.oe)-1].Data, p.tok.Data) {2240p.oe = p.oe[:len(p.oe)-1]2241return true2242}2243for i := len(p.oe) - 1; i >= 0; i-- {2244if strings.EqualFold(p.oe[i].Data, p.tok.Data) {2245p.oe = p.oe[:i]2246return true2247}2248if i > 0 && p.oe[i-1].Namespace == "" {2249break2250}2251}2252return p.im(p)2253default:2254// Ignore the token.2255}2256return true2257}22582259// Section 12.2.4.2.2260func (p *parser) adjustedCurrentNode() *Node {2261if len(p.oe) == 1 && p.fragment && p.context != nil {2262return p.context2263}2264return p.oe.top()2265}22662267// Section 12.2.6.2268func (p *parser) inForeignContent() bool {2269if len(p.oe) == 0 {2270return false2271}2272n := p.adjustedCurrentNode()2273if n.Namespace == "" {2274return false2275}2276if mathMLTextIntegrationPoint(n) {2277if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {2278return false2279}2280if p.tok.Type == TextToken {2281return false2282}2283}2284if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {2285return false2286}2287if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {2288return false2289}2290if p.tok.Type == ErrorToken {2291return false2292}2293return true2294}22952296// parseImpliedToken parses a token as though it had appeared in the parser's2297// input.2298func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {2299realToken, selfClosing := p.tok, p.hasSelfClosingToken2300p.tok = Token{2301Type: t,2302DataAtom: dataAtom,2303Data: data,2304}2305p.hasSelfClosingToken = false2306p.parseCurrentToken()2307p.tok, p.hasSelfClosingToken = realToken, selfClosing2308}23092310// parseCurrentToken runs the current token through the parsing routines2311// 
until it is consumed.2312func (p *parser) parseCurrentToken() {2313if p.tok.Type == SelfClosingTagToken {2314p.hasSelfClosingToken = true2315p.tok.Type = StartTagToken2316}23172318consumed := false2319for !consumed {2320if p.inForeignContent() {2321consumed = parseForeignContent(p)2322} else {2323consumed = p.im(p)2324}2325}23262327if p.hasSelfClosingToken {2328// This is a parse error, but ignore it.2329p.hasSelfClosingToken = false2330}2331}23322333func (p *parser) parse() (err error) {2334defer func() {2335if panicErr := recover(); panicErr != nil {2336err = fmt.Errorf("%s", panicErr)2337}2338}()2339// Iterate until EOF. Any other error will cause an early return.2340for err != io.EOF {2341// CDATA sections are allowed only in foreign content.2342n := p.oe.top()2343p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")2344// Read and parse the next token.2345p.tokenizer.Next()2346p.tok = p.tokenizer.Token()2347if p.tok.Type == ErrorToken {2348err = p.tokenizer.Err()2349if err != nil && err != io.EOF {2350return err2351}2352}2353p.parseCurrentToken()2354}2355return nil2356}23572358// Parse returns the parse tree for the HTML from the given Reader.2359//2360// It implements the HTML5 parsing algorithm2361// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),2362// which is very complicated. The resultant tree can contain implicitly created2363// nodes that have no explicit <tag> listed in r's data, and nodes' parents can2364// differ from the nesting implied by a naive processing of start and end2365// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,2366// with no corresponding node in the resulting tree.2367//2368// Parse will reject HTML that is nested deeper than 512 elements.2369//2370// The input is assumed to be UTF-8 encoded.2371func Parse(r io.Reader) (*Node, error) {2372return ParseWithOptions(r)2373}23742375// ParseFragment parses a fragment of HTML and returns the nodes that were2376// found. 
If the fragment is the InnerHTML for an existing element, pass that2377// element in context.2378//2379// It has the same intricacies as Parse.2380func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {2381return ParseFragmentWithOptions(r, context)2382}23832384// ParseOption configures a parser.2385type ParseOption func(p *parser)23862387// ParseOptionEnableScripting configures the scripting flag.2388// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting2389//2390// By default, scripting is enabled.2391func ParseOptionEnableScripting(enable bool) ParseOption {2392return func(p *parser) {2393p.scripting = enable2394}2395}23962397// ParseWithOptions is like Parse, with options.2398func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {2399p := &parser{2400tokenizer: NewTokenizer(r),2401doc: &Node{2402Type: DocumentNode,2403},2404scripting: true,2405framesetOK: true,2406im: initialIM,2407}24082409for _, f := range opts {2410f(p)2411}24122413if err := p.parse(); err != nil {2414return nil, err2415}2416return p.doc, nil2417}24182419// ParseFragmentWithOptions is like ParseFragment, with options.2420func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {2421contextTag := ""2422if context != nil {2423if context.Type != ElementNode {2424return nil, errors.New("html: ParseFragment of non-element Node")2425}2426// The next check isn't just context.DataAtom.String() == context.Data because2427// it is valid to pass an element whose tag isn't a known atom. 
For example,2428// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.2429if context.DataAtom != a.Lookup([]byte(context.Data)) {2430return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)2431}2432contextTag = context.DataAtom.String()2433}2434p := &parser{2435doc: &Node{2436Type: DocumentNode,2437},2438scripting: true,2439fragment: true,2440context: context,2441}2442if context != nil && context.Namespace != "" {2443p.tokenizer = NewTokenizer(r)2444} else {2445p.tokenizer = NewTokenizerFragment(r, contextTag)2446}24472448for _, f := range opts {2449f(p)2450}24512452root := &Node{2453Type: ElementNode,2454DataAtom: a.Html,2455Data: a.Html.String(),2456}2457p.doc.AppendChild(root)2458p.oe = nodeStack{root}2459if context != nil && context.DataAtom == a.Template {2460p.templateStack = append(p.templateStack, inTemplateIM)2461}2462p.resetInsertionMode()24632464for n := context; n != nil; n = n.Parent {2465if n.Type == ElementNode && n.DataAtom == a.Form {2466p.form = n2467break2468}2469}24702471if err := p.parse(); err != nil {2472return nil, err2473}24742475parent := p.doc2476if context != nil {2477parent = root2478}24792480var result []*Node2481for c := parent.FirstChild; c != nil; {2482next := c.NextSibling2483parent.RemoveChild(c)2484result = append(result, c)2485c = next2486}2487return result, nil2488}248924902491