CoCalc -- parse.go

GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/vendor/golang.org/x/net/html/parse.go
²⁸⁸⁰ views
1
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4

5
package html
6

7
import (
8
	"errors"
9
	"fmt"
10
	"io"
11
	"strings"
12

13
	a "golang.org/x/net/html/atom"
14
)
15

16
// A parser implements the HTML5 parsing algorithm:
17
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
18
type parser struct {
19
	// tokenizer provides the tokens for the parser.
20
	tokenizer *Tokenizer
21
	// tok is the most recently read token.
22
	tok Token
23
	// Self-closing tags like <hr/> are treated as start tags, except that
24
	// hasSelfClosingToken is set while they are being processed.
25
	hasSelfClosingToken bool
26
	// doc is the document root element.
27
	doc *Node
28
	// The stack of open elements (section 12.2.4.2) and active formatting
29
	// elements (section 12.2.4.3).
30
	oe, afe nodeStack
31
	// Element pointers (section 12.2.4.4).
32
	head, form *Node
33
	// Other parsing state flags (section 12.2.4.5).
34
	scripting, framesetOK bool
35
	// The stack of template insertion modes
36
	templateStack insertionModeStack
37
	// im is the current insertion mode.
38
	im insertionMode
39
	// originalIM is the insertion mode to go back to after completing a text
40
	// or inTableText insertion mode.
41
	originalIM insertionMode
42
	// fosterParenting is whether new elements should be inserted according to
43
	// the foster parenting rules (section 12.2.6.1).
44
	fosterParenting bool
45
	// quirks is whether the parser is operating in "quirks mode."
46
	quirks bool
47
	// fragment is whether the parser is parsing an HTML fragment.
48
	fragment bool
49
	// context is the context element when parsing an HTML fragment
50
	// (section 12.4).
51
	context *Node
52
}
53

54
// top returns the node on top of the stack of open elements, falling back to
// the document root when p.oe.top() yields nil.
func (p *parser) top() *Node {
	current := p.oe.top()
	if current == nil {
		return p.doc
	}
	return current
}
60

61
// Stop tags for use in popUntil. These come from section 12.2.4.2.
62
var (
63
	defaultScopeStopTags = map[string][]a.Atom{
64
		"":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
65
		"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
66
		"svg":  {a.Desc, a.ForeignObject, a.Title},
67
	}
68
)
69

70
// scope selects which set of stop tags bounds a search of, or a truncation
// of, the stack of open elements.
type scope int

const (
	// defaultScope stops only at the tags listed in defaultScopeStopTags.
	defaultScope scope = iota
	// listItemScope stops at ol and ul in addition to the default stop tags.
	listItemScope
	// buttonScope stops at button in addition to the default stop tags.
	buttonScope
	// tableScope stops at html, table and template.
	tableScope
	// tableRowScope is used by clearStackToContext, where it stops at html,
	// tr and template.
	tableRowScope
	// tableBodyScope is used by clearStackToContext, where it stops at html,
	// tbody, tfoot, thead and template.
	tableBodyScope
	// selectScope stops at every element other than optgroup and option.
	selectScope
)
81

82
// popUntil pops the stack of open elements at the highest element whose tag
83
// is in matchTags, provided there is no higher element in the scope's stop
84
// tags (as defined in section 12.2.4.2). It returns whether or not there was
85
// such an element. If there was not, popUntil leaves the stack unchanged.
86
//
87
// For example, the set of stop tags for table scope is: "html", "table". If
88
// the stack was:
89
// ["html", "body", "font", "table", "b", "i", "u"]
90
// then popUntil(tableScope, "font") would return false, but
91
// popUntil(tableScope, "i") would return true and the stack would become:
92
// ["html", "body", "font", "table", "b"]
93
//
94
// If an element's tag is in both the stop tags and matchTags, then the stack
95
// will be popped and the function returns true (provided, of course, there was
96
// no higher element in the stack that was also in the stop tags). For example,
97
// popUntil(tableScope, "table") returns true and leaves:
98
// ["html", "body", "font"]
99
func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100
	if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
101
		p.oe = p.oe[:i]
102
		return true
103
	}
104
	return false
105
}
106

107
// indexOfElementInScope returns the index in p.oe of the highest element whose
108
// tag is in matchTags that is in scope. If no matching element is in scope, it
109
// returns -1.
110
func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111
	for i := len(p.oe) - 1; i >= 0; i-- {
112
		tagAtom := p.oe[i].DataAtom
113
		if p.oe[i].Namespace == "" {
114
			for _, t := range matchTags {
115
				if t == tagAtom {
116
					return i
117
				}
118
			}
119
			switch s {
120
			case defaultScope:
121
				// No-op.
122
			case listItemScope:
123
				if tagAtom == a.Ol || tagAtom == a.Ul {
124
					return -1
125
				}
126
			case buttonScope:
127
				if tagAtom == a.Button {
128
					return -1
129
				}
130
			case tableScope:
131
				if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
132
					return -1
133
				}
134
			case selectScope:
135
				if tagAtom != a.Optgroup && tagAtom != a.Option {
136
					return -1
137
				}
138
			default:
139
				panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s))
140
			}
141
		}
142
		switch s {
143
		case defaultScope, listItemScope, buttonScope:
144
			for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
145
				if t == tagAtom {
146
					return -1
147
				}
148
			}
149
		}
150
	}
151
	return -1
152
}
153

154
// elementInScope is like popUntil, except that it doesn't modify the stack of
155
// open elements.
156
func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157
	return p.indexOfElementInScope(s, matchTags...) != -1
158
}
159

160
// clearStackToContext pops elements off the stack of open elements until a
161
// scope-defined element is found.
162
func (p *parser) clearStackToContext(s scope) {
163
	for i := len(p.oe) - 1; i >= 0; i-- {
164
		tagAtom := p.oe[i].DataAtom
165
		switch s {
166
		case tableScope:
167
			if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
168
				p.oe = p.oe[:i+1]
169
				return
170
			}
171
		case tableRowScope:
172
			if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
173
				p.oe = p.oe[:i+1]
174
				return
175
			}
176
		case tableBodyScope:
177
			if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
178
				p.oe = p.oe[:i+1]
179
				return
180
			}
181
		default:
182
			panic(fmt.Sprintf("html: internal error: clearStackToContext unknown scope: %d", s))
183
		}
184
	}
185
}
186

187
// parseGenericRawTextElement implements the generic raw text element parsing
188
// algorithm defined in 12.2.6.2.
189
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
190
// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
191
// officially, need to make tokenizer consider both states.
192
func (p *parser) parseGenericRawTextElement() {
193
	p.addElement()
194
	p.originalIM = p.im
195
	p.im = textIM
196
}
197

198
// generateImpliedEndTags pops nodes off the stack of open elements as long as
199
// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
200
// If exceptions are specified, nodes with that name will not be popped off.
201
func (p *parser) generateImpliedEndTags(exceptions ...string) {
202
	var i int
203
loop:
204
	for i = len(p.oe) - 1; i >= 0; i-- {
205
		n := p.oe[i]
206
		if n.Type != ElementNode {
207
			break
208
		}
209
		switch n.DataAtom {
210
		case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
211
			for _, except := range exceptions {
212
				if n.Data == except {
213
					break loop
214
				}
215
			}
216
			continue
217
		}
218
		break
219
	}
220

221
	p.oe = p.oe[:i+1]
222
}
223

224
// addChild adds a child node n to the top element, and pushes n onto the stack
225
// of open elements if it is an element node.
226
func (p *parser) addChild(n *Node) {
227
	if p.shouldFosterParent() {
228
		p.fosterParent(n)
229
	} else {
230
		p.top().AppendChild(n)
231
	}
232

233
	if n.Type == ElementNode {
234
		p.insertOpenElement(n)
235
	}
236
}
237

238
// insertOpenElement pushes n onto the stack of open elements. It panics when
// the stack grows beyond 512 nodes.
func (p *parser) insertOpenElement(n *Node) {
	const maxOpenElements = 512

	p.oe = append(p.oe, n)
	if len(p.oe) > maxOpenElements {
		panic("html: open stack of elements exceeds 512 nodes")
	}
}
244

245
// shouldFosterParent returns whether the next node to be added should be
246
// foster parented.
247
func (p *parser) shouldFosterParent() bool {
248
	if p.fosterParenting {
249
		switch p.top().DataAtom {
250
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
251
			return true
252
		}
253
	}
254
	return false
255
}
256

257
// fosterParent adds a child node according to the foster parenting rules.
258
// Section 12.2.6.1, "foster parenting".
259
func (p *parser) fosterParent(n *Node) {
260
	var table, parent, prev, template *Node
261
	var i int
262
	for i = len(p.oe) - 1; i >= 0; i-- {
263
		if p.oe[i].DataAtom == a.Table {
264
			table = p.oe[i]
265
			break
266
		}
267
	}
268

269
	var j int
270
	for j = len(p.oe) - 1; j >= 0; j-- {
271
		if p.oe[j].DataAtom == a.Template {
272
			template = p.oe[j]
273
			break
274
		}
275
	}
276

277
	if template != nil && (table == nil || j > i) {
278
		template.AppendChild(n)
279
		return
280
	}
281

282
	if table == nil {
283
		// The foster parent is the html element.
284
		parent = p.oe[0]
285
	} else {
286
		parent = table.Parent
287
	}
288
	if parent == nil {
289
		parent = p.oe[i-1]
290
	}
291

292
	if table != nil {
293
		prev = table.PrevSibling
294
	} else {
295
		prev = parent.LastChild
296
	}
297
	if prev != nil && prev.Type == TextNode && n.Type == TextNode {
298
		prev.Data += n.Data
299
		return
300
	}
301

302
	parent.InsertBefore(n, table)
303
}
304

305
// addText adds text to the preceding node if it is a text node, or else it
306
// calls addChild with a new text node.
307
func (p *parser) addText(text string) {
308
	if text == "" {
309
		return
310
	}
311

312
	if p.shouldFosterParent() {
313
		p.fosterParent(&Node{
314
			Type: TextNode,
315
			Data: text,
316
		})
317
		return
318
	}
319

320
	t := p.top()
321
	if n := t.LastChild; n != nil && n.Type == TextNode {
322
		n.Data += text
323
		return
324
	}
325
	p.addChild(&Node{
326
		Type: TextNode,
327
		Data: text,
328
	})
329
}
330

331
// addElement adds a child element based on the current token.
332
func (p *parser) addElement() {
333
	p.addChild(&Node{
334
		Type:     ElementNode,
335
		DataAtom: p.tok.DataAtom,
336
		Data:     p.tok.Data,
337
		Attr:     p.tok.Attr,
338
	})
339
}
340

341
// Section 12.2.4.3.
342
func (p *parser) addFormattingElement() {
343
	tagAtom, attr := p.tok.DataAtom, p.tok.Attr
344
	p.addElement()
345

346
	// Implement the Noah's Ark clause, but with three per family instead of two.
347
	identicalElements := 0
348
findIdenticalElements:
349
	for i := len(p.afe) - 1; i >= 0; i-- {
350
		n := p.afe[i]
351
		if n.Type == scopeMarkerNode {
352
			break
353
		}
354
		if n.Type != ElementNode {
355
			continue
356
		}
357
		if n.Namespace != "" {
358
			continue
359
		}
360
		if n.DataAtom != tagAtom {
361
			continue
362
		}
363
		if len(n.Attr) != len(attr) {
364
			continue
365
		}
366
	compareAttributes:
367
		for _, t0 := range n.Attr {
368
			for _, t1 := range attr {
369
				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
370
					// Found a match for this attribute, continue with the next attribute.
371
					continue compareAttributes
372
				}
373
			}
374
			// If we get here, there is no attribute that matches a.
375
			// Therefore the element is not identical to the new one.
376
			continue findIdenticalElements
377
		}
378

379
		identicalElements++
380
		if identicalElements >= 3 {
381
			p.afe.remove(n)
382
		}
383
	}
384

385
	p.afe = append(p.afe, p.top())
386
}
387

388
// Section 12.2.4.3.
389
func (p *parser) clearActiveFormattingElements() {
390
	for {
391
		if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
392
			return
393
		}
394
	}
395
}
396

397
// Section 12.2.4.3.
398
func (p *parser) reconstructActiveFormattingElements() {
399
	n := p.afe.top()
400
	if n == nil {
401
		return
402
	}
403
	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
404
		return
405
	}
406
	i := len(p.afe) - 1
407
	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
408
		if i == 0 {
409
			i = -1
410
			break
411
		}
412
		i--
413
		n = p.afe[i]
414
	}
415
	for {
416
		i++
417
		clone := p.afe[i].clone()
418
		p.addChild(clone)
419
		p.afe[i] = clone
420
		if i == len(p.afe)-1 {
421
			break
422
		}
423
	}
424
}
425

426
// Section 12.2.5.
427
func (p *parser) acknowledgeSelfClosingTag() {
428
	p.hasSelfClosingToken = false
429
}
430

431
// An insertion mode (section 12.2.4.1) is the state transition function from
432
// a particular state in the HTML5 parser's state machine. It updates the
433
// parser's fields depending on parser.tok (where ErrorToken means EOF).
434
// It returns whether the token was consumed.
435
type insertionMode func(*parser) bool
436

437
// setOriginalIM sets the insertion mode to return to after completing a text or
438
// inTableText insertion mode.
439
// Section 12.2.4.1, "using the rules for".
440
func (p *parser) setOriginalIM() {
441
	if p.originalIM != nil {
442
		panic("html: bad parser state: originalIM was set twice")
443
	}
444
	p.originalIM = p.im
445
}
446

447
// Section 12.2.4.1, "reset the insertion mode".
448
func (p *parser) resetInsertionMode() {
449
	for i := len(p.oe) - 1; i >= 0; i-- {
450
		n := p.oe[i]
451
		last := i == 0
452
		if last && p.context != nil {
453
			n = p.context
454
		}
455

456
		switch n.DataAtom {
457
		case a.Select:
458
			if !last {
459
				for ancestor, first := n, p.oe[0]; ancestor != first; {
460
					ancestor = p.oe[p.oe.index(ancestor)-1]
461
					switch ancestor.DataAtom {
462
					case a.Template:
463
						p.im = inSelectIM
464
						return
465
					case a.Table:
466
						p.im = inSelectInTableIM
467
						return
468
					}
469
				}
470
			}
471
			p.im = inSelectIM
472
		case a.Td, a.Th:
473
			// TODO: remove this divergence from the HTML5 spec.
474
			//
475
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
476
			p.im = inCellIM
477
		case a.Tr:
478
			p.im = inRowIM
479
		case a.Tbody, a.Thead, a.Tfoot:
480
			p.im = inTableBodyIM
481
		case a.Caption:
482
			p.im = inCaptionIM
483
		case a.Colgroup:
484
			p.im = inColumnGroupIM
485
		case a.Table:
486
			p.im = inTableIM
487
		case a.Template:
488
			// TODO: remove this divergence from the HTML5 spec.
489
			if n.Namespace != "" {
490
				continue
491
			}
492
			p.im = p.templateStack.top()
493
		case a.Head:
494
			// TODO: remove this divergence from the HTML5 spec.
495
			//
496
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
497
			p.im = inHeadIM
498
		case a.Body:
499
			p.im = inBodyIM
500
		case a.Frameset:
501
			p.im = inFramesetIM
502
		case a.Html:
503
			if p.head == nil {
504
				p.im = beforeHeadIM
505
			} else {
506
				p.im = afterHeadIM
507
			}
508
		default:
509
			if last {
510
				p.im = inBodyIM
511
				return
512
			}
513
			continue
514
		}
515
		return
516
	}
517
}
518

519
// whitespace is the cutset passed to strings.TrimLeft by the insertion modes
// when they strip leading whitespace from text tokens.
const whitespace = " \t\r\n\f"
520

521
// Section 12.2.6.4.1.
522
func initialIM(p *parser) bool {
523
	switch p.tok.Type {
524
	case TextToken:
525
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
526
		if len(p.tok.Data) == 0 {
527
			// It was all whitespace, so ignore it.
528
			return true
529
		}
530
	case CommentToken:
531
		p.doc.AppendChild(&Node{
532
			Type: CommentNode,
533
			Data: p.tok.Data,
534
		})
535
		return true
536
	case DoctypeToken:
537
		n, quirks := parseDoctype(p.tok.Data)
538
		p.doc.AppendChild(n)
539
		p.quirks = quirks
540
		p.im = beforeHTMLIM
541
		return true
542
	}
543
	p.quirks = true
544
	p.im = beforeHTMLIM
545
	return false
546
}
547

548
// Section 12.2.6.4.2.
549
func beforeHTMLIM(p *parser) bool {
550
	switch p.tok.Type {
551
	case DoctypeToken:
552
		// Ignore the token.
553
		return true
554
	case TextToken:
555
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
556
		if len(p.tok.Data) == 0 {
557
			// It was all whitespace, so ignore it.
558
			return true
559
		}
560
	case StartTagToken:
561
		if p.tok.DataAtom == a.Html {
562
			p.addElement()
563
			p.im = beforeHeadIM
564
			return true
565
		}
566
	case EndTagToken:
567
		switch p.tok.DataAtom {
568
		case a.Head, a.Body, a.Html, a.Br:
569
			p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
570
			return false
571
		default:
572
			// Ignore the token.
573
			return true
574
		}
575
	case CommentToken:
576
		p.doc.AppendChild(&Node{
577
			Type: CommentNode,
578
			Data: p.tok.Data,
579
		})
580
		return true
581
	}
582
	p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
583
	return false
584
}
585

586
// Section 12.2.6.4.3.
587
func beforeHeadIM(p *parser) bool {
588
	switch p.tok.Type {
589
	case TextToken:
590
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
591
		if len(p.tok.Data) == 0 {
592
			// It was all whitespace, so ignore it.
593
			return true
594
		}
595
	case StartTagToken:
596
		switch p.tok.DataAtom {
597
		case a.Head:
598
			p.addElement()
599
			p.head = p.top()
600
			p.im = inHeadIM
601
			return true
602
		case a.Html:
603
			return inBodyIM(p)
604
		}
605
	case EndTagToken:
606
		switch p.tok.DataAtom {
607
		case a.Head, a.Body, a.Html, a.Br:
608
			p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
609
			return false
610
		default:
611
			// Ignore the token.
612
			return true
613
		}
614
	case CommentToken:
615
		p.addChild(&Node{
616
			Type: CommentNode,
617
			Data: p.tok.Data,
618
		})
619
		return true
620
	case DoctypeToken:
621
		// Ignore the token.
622
		return true
623
	}
624

625
	p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
626
	return false
627
}
628

629
// Section 12.2.6.4.4.
630
func inHeadIM(p *parser) bool {
631
	switch p.tok.Type {
632
	case TextToken:
633
		s := strings.TrimLeft(p.tok.Data, whitespace)
634
		if len(s) < len(p.tok.Data) {
635
			// Add the initial whitespace to the current node.
636
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
637
			if s == "" {
638
				return true
639
			}
640
			p.tok.Data = s
641
		}
642
	case StartTagToken:
643
		switch p.tok.DataAtom {
644
		case a.Html:
645
			return inBodyIM(p)
646
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
647
			p.addElement()
648
			p.oe.pop()
649
			p.acknowledgeSelfClosingTag()
650
			return true
651
		case a.Noscript:
652
			if p.scripting {
653
				p.parseGenericRawTextElement()
654
				return true
655
			}
656
			p.addElement()
657
			p.im = inHeadNoscriptIM
658
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
659
			p.tokenizer.NextIsNotRawText()
660
			return true
661
		case a.Script, a.Title:
662
			p.addElement()
663
			p.setOriginalIM()
664
			p.im = textIM
665
			return true
666
		case a.Noframes, a.Style:
667
			p.parseGenericRawTextElement()
668
			return true
669
		case a.Head:
670
			// Ignore the token.
671
			return true
672
		case a.Template:
673
			// TODO: remove this divergence from the HTML5 spec.
674
			//
675
			// We don't handle all of the corner cases when mixing foreign
676
			// content (i.e. <math> or <svg>) with <template>. Without this
677
			// early return, we can get into an infinite loop, possibly because
678
			// of the "TODO... further divergence" a little below.
679
			//
680
			// As a workaround, if we are mixing foreign content and templates,
681
			// just ignore the rest of the HTML. Foreign content is rare and a
682
			// relatively old HTML feature. Templates are also rare and a
683
			// relatively new HTML feature. Their combination is very rare.
684
			for _, e := range p.oe {
685
				if e.Namespace != "" {
686
					p.im = ignoreTheRemainingTokens
687
					return true
688
				}
689
			}
690

691
			p.addElement()
692
			p.afe = append(p.afe, &scopeMarker)
693
			p.framesetOK = false
694
			p.im = inTemplateIM
695
			p.templateStack = append(p.templateStack, inTemplateIM)
696
			return true
697
		}
698
	case EndTagToken:
699
		switch p.tok.DataAtom {
700
		case a.Head:
701
			p.oe.pop()
702
			p.im = afterHeadIM
703
			return true
704
		case a.Body, a.Html, a.Br:
705
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
706
			return false
707
		case a.Template:
708
			if !p.oe.contains(a.Template) {
709
				return true
710
			}
711
			// TODO: remove this further divergence from the HTML5 spec.
712
			//
713
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
714
			p.generateImpliedEndTags()
715
			for i := len(p.oe) - 1; i >= 0; i-- {
716
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
717
					p.oe = p.oe[:i]
718
					break
719
				}
720
			}
721
			p.clearActiveFormattingElements()
722
			p.templateStack.pop()
723
			p.resetInsertionMode()
724
			return true
725
		default:
726
			// Ignore the token.
727
			return true
728
		}
729
	case CommentToken:
730
		p.addChild(&Node{
731
			Type: CommentNode,
732
			Data: p.tok.Data,
733
		})
734
		return true
735
	case DoctypeToken:
736
		// Ignore the token.
737
		return true
738
	}
739

740
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
741
	return false
742
}
743

744
// Section 12.2.6.4.5.
745
func inHeadNoscriptIM(p *parser) bool {
746
	switch p.tok.Type {
747
	case DoctypeToken:
748
		// Ignore the token.
749
		return true
750
	case StartTagToken:
751
		switch p.tok.DataAtom {
752
		case a.Html:
753
			return inBodyIM(p)
754
		case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
755
			return inHeadIM(p)
756
		case a.Head:
757
			// Ignore the token.
758
			return true
759
		case a.Noscript:
760
			// Don't let the tokenizer go into raw text mode even when a <noscript>
761
			// tag is in "in head noscript" insertion mode.
762
			p.tokenizer.NextIsNotRawText()
763
			// Ignore the token.
764
			return true
765
		}
766
	case EndTagToken:
767
		switch p.tok.DataAtom {
768
		case a.Noscript, a.Br:
769
		default:
770
			// Ignore the token.
771
			return true
772
		}
773
	case TextToken:
774
		s := strings.TrimLeft(p.tok.Data, whitespace)
775
		if len(s) == 0 {
776
			// It was all whitespace.
777
			return inHeadIM(p)
778
		}
779
	case CommentToken:
780
		return inHeadIM(p)
781
	}
782
	p.oe.pop()
783
	if p.top().DataAtom != a.Head {
784
		panic("html: the new current node will be a head element.")
785
	}
786
	p.im = inHeadIM
787
	if p.tok.DataAtom == a.Noscript {
788
		return true
789
	}
790
	return false
791
}
792

793
// Section 12.2.6.4.6.
794
func afterHeadIM(p *parser) bool {
795
	switch p.tok.Type {
796
	case TextToken:
797
		s := strings.TrimLeft(p.tok.Data, whitespace)
798
		if len(s) < len(p.tok.Data) {
799
			// Add the initial whitespace to the current node.
800
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
801
			if s == "" {
802
				return true
803
			}
804
			p.tok.Data = s
805
		}
806
	case StartTagToken:
807
		switch p.tok.DataAtom {
808
		case a.Html:
809
			return inBodyIM(p)
810
		case a.Body:
811
			p.addElement()
812
			p.framesetOK = false
813
			p.im = inBodyIM
814
			return true
815
		case a.Frameset:
816
			p.addElement()
817
			p.im = inFramesetIM
818
			return true
819
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
820
			p.insertOpenElement(p.head)
821
			defer p.oe.remove(p.head)
822
			return inHeadIM(p)
823
		case a.Head:
824
			// Ignore the token.
825
			return true
826
		}
827
	case EndTagToken:
828
		switch p.tok.DataAtom {
829
		case a.Body, a.Html, a.Br:
830
			// Drop down to creating an implied <body> tag.
831
		case a.Template:
832
			return inHeadIM(p)
833
		default:
834
			// Ignore the token.
835
			return true
836
		}
837
	case CommentToken:
838
		p.addChild(&Node{
839
			Type: CommentNode,
840
			Data: p.tok.Data,
841
		})
842
		return true
843
	case DoctypeToken:
844
		// Ignore the token.
845
		return true
846
	}
847

848
	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
849
	p.framesetOK = true
850
	if p.tok.Type == ErrorToken {
851
		// Stop parsing.
852
		return true
853
	}
854
	return false
855
}
856

857
// copyAttributes copies attributes of src not found on dst to dst.
858
func copyAttributes(dst *Node, src Token) {
859
	if len(src.Attr) == 0 {
860
		return
861
	}
862
	attr := map[string]string{}
863
	for _, t := range dst.Attr {
864
		attr[t.Key] = t.Val
865
	}
866
	for _, t := range src.Attr {
867
		if _, ok := attr[t.Key]; !ok {
868
			dst.Attr = append(dst.Attr, t)
869
			attr[t.Key] = t.Val
870
		}
871
	}
872
}
873

874
// Section 12.2.6.4.7.
875
func inBodyIM(p *parser) bool {
876
	switch p.tok.Type {
877
	case TextToken:
878
		d := p.tok.Data
879
		switch n := p.oe.top(); n.DataAtom {
880
		case a.Pre, a.Listing:
881
			if n.FirstChild == nil {
882
				// Ignore a newline at the start of a <pre> block.
883
				if d != "" && d[0] == '\r' {
884
					d = d[1:]
885
				}
886
				if d != "" && d[0] == '\n' {
887
					d = d[1:]
888
				}
889
			}
890
		}
891
		d = strings.Replace(d, "\x00", "", -1)
892
		if d == "" {
893
			return true
894
		}
895
		p.reconstructActiveFormattingElements()
896
		p.addText(d)
897
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
898
			// There were non-whitespace characters inserted.
899
			p.framesetOK = false
900
		}
901
	case StartTagToken:
902
		switch p.tok.DataAtom {
903
		case a.Html:
904
			if p.oe.contains(a.Template) {
905
				return true
906
			}
907
			copyAttributes(p.oe[0], p.tok)
908
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
909
			return inHeadIM(p)
910
		case a.Body:
911
			if p.oe.contains(a.Template) {
912
				return true
913
			}
914
			if len(p.oe) >= 2 {
915
				body := p.oe[1]
916
				if body.Type == ElementNode && body.DataAtom == a.Body {
917
					p.framesetOK = false
918
					copyAttributes(body, p.tok)
919
				}
920
			}
921
		case a.Frameset:
922
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
923
				// Ignore the token.
924
				return true
925
			}
926
			body := p.oe[1]
927
			if body.Parent != nil {
928
				body.Parent.RemoveChild(body)
929
			}
930
			p.oe = p.oe[:1]
931
			p.addElement()
932
			p.im = inFramesetIM
933
			return true
934
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
935
			p.popUntil(buttonScope, a.P)
936
			p.addElement()
937
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
938
			p.popUntil(buttonScope, a.P)
939
			switch n := p.top(); n.DataAtom {
940
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
941
				p.oe.pop()
942
			}
943
			p.addElement()
944
		case a.Pre, a.Listing:
945
			p.popUntil(buttonScope, a.P)
946
			p.addElement()
947
			// The newline, if any, will be dealt with by the TextToken case.
948
			p.framesetOK = false
949
		case a.Form:
950
			if p.form != nil && !p.oe.contains(a.Template) {
951
				// Ignore the token
952
				return true
953
			}
954
			p.popUntil(buttonScope, a.P)
955
			p.addElement()
956
			if !p.oe.contains(a.Template) {
957
				p.form = p.top()
958
			}
959
		case a.Li:
960
			p.framesetOK = false
961
			for i := len(p.oe) - 1; i >= 0; i-- {
962
				node := p.oe[i]
963
				switch node.DataAtom {
964
				case a.Li:
965
					p.oe = p.oe[:i]
966
				case a.Address, a.Div, a.P:
967
					continue
968
				default:
969
					if !isSpecialElement(node) {
970
						continue
971
					}
972
				}
973
				break
974
			}
975
			p.popUntil(buttonScope, a.P)
976
			p.addElement()
977
		case a.Dd, a.Dt:
978
			p.framesetOK = false
979
			for i := len(p.oe) - 1; i >= 0; i-- {
980
				node := p.oe[i]
981
				switch node.DataAtom {
982
				case a.Dd, a.Dt:
983
					p.oe = p.oe[:i]
984
				case a.Address, a.Div, a.P:
985
					continue
986
				default:
987
					if !isSpecialElement(node) {
988
						continue
989
					}
990
				}
991
				break
992
			}
993
			p.popUntil(buttonScope, a.P)
994
			p.addElement()
995
		case a.Plaintext:
996
			p.popUntil(buttonScope, a.P)
997
			p.addElement()
998
		case a.Button:
999
			p.popUntil(defaultScope, a.Button)
1000
			p.reconstructActiveFormattingElements()
1001
			p.addElement()
1002
			p.framesetOK = false
1003
		case a.A:
1004
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
1005
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
1006
					p.inBodyEndTagFormatting(a.A, "a")
1007
					p.oe.remove(n)
1008
					p.afe.remove(n)
1009
					break
1010
				}
1011
			}
1012
			p.reconstructActiveFormattingElements()
1013
			p.addFormattingElement()
1014
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1015
			p.reconstructActiveFormattingElements()
1016
			p.addFormattingElement()
1017
		case a.Nobr:
1018
			p.reconstructActiveFormattingElements()
1019
			if p.elementInScope(defaultScope, a.Nobr) {
1020
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
1021
				p.reconstructActiveFormattingElements()
1022
			}
1023
			p.addFormattingElement()
1024
		case a.Applet, a.Marquee, a.Object:
1025
			p.reconstructActiveFormattingElements()
1026
			p.addElement()
1027
			p.afe = append(p.afe, &scopeMarker)
1028
			p.framesetOK = false
1029
		case a.Table:
1030
			if !p.quirks {
1031
				p.popUntil(buttonScope, a.P)
1032
			}
1033
			p.addElement()
1034
			p.framesetOK = false
1035
			p.im = inTableIM
1036
			return true
1037
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
1038
			p.reconstructActiveFormattingElements()
1039
			p.addElement()
1040
			p.oe.pop()
1041
			p.acknowledgeSelfClosingTag()
1042
			if p.tok.DataAtom == a.Input {
1043
				for _, t := range p.tok.Attr {
1044
					if t.Key == "type" {
1045
						if strings.EqualFold(t.Val, "hidden") {
1046
							// Skip setting framesetOK = false
1047
							return true
1048
						}
1049
					}
1050
				}
1051
			}
1052
			p.framesetOK = false
1053
		case a.Param, a.Source, a.Track:
1054
			p.addElement()
1055
			p.oe.pop()
1056
			p.acknowledgeSelfClosingTag()
1057
		case a.Hr:
1058
			p.popUntil(buttonScope, a.P)
1059
			p.addElement()
1060
			p.oe.pop()
1061
			p.acknowledgeSelfClosingTag()
1062
			p.framesetOK = false
1063
		case a.Image:
1064
			p.tok.DataAtom = a.Img
1065
			p.tok.Data = a.Img.String()
1066
			return false
1067
		case a.Textarea:
1068
			p.addElement()
1069
			p.setOriginalIM()
1070
			p.framesetOK = false
1071
			p.im = textIM
1072
		case a.Xmp:
1073
			p.popUntil(buttonScope, a.P)
1074
			p.reconstructActiveFormattingElements()
1075
			p.framesetOK = false
1076
			p.parseGenericRawTextElement()
1077
		case a.Iframe:
1078
			p.framesetOK = false
1079
			p.parseGenericRawTextElement()
1080
		case a.Noembed:
1081
			p.parseGenericRawTextElement()
1082
		case a.Noscript:
1083
			if p.scripting {
1084
				p.parseGenericRawTextElement()
1085
				return true
1086
			}
1087
			p.reconstructActiveFormattingElements()
1088
			p.addElement()
1089
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
1090
			p.tokenizer.NextIsNotRawText()
1091
		case a.Select:
1092
			p.reconstructActiveFormattingElements()
1093
			p.addElement()
1094
			p.framesetOK = false
1095
			p.im = inSelectIM
1096
			return true
1097
		case a.Optgroup, a.Option:
1098
			if p.top().DataAtom == a.Option {
1099
				p.oe.pop()
1100
			}
1101
			p.reconstructActiveFormattingElements()
1102
			p.addElement()
1103
		case a.Rb, a.Rtc:
1104
			if p.elementInScope(defaultScope, a.Ruby) {
1105
				p.generateImpliedEndTags()
1106
			}
1107
			p.addElement()
1108
		case a.Rp, a.Rt:
1109
			if p.elementInScope(defaultScope, a.Ruby) {
1110
				p.generateImpliedEndTags("rtc")
1111
			}
1112
			p.addElement()
1113
		case a.Math, a.Svg:
1114
			p.reconstructActiveFormattingElements()
1115
			if p.tok.DataAtom == a.Math {
1116
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
1117
			} else {
1118
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
1119
			}
1120
			adjustForeignAttributes(p.tok.Attr)
1121
			p.addElement()
1122
			p.top().Namespace = p.tok.Data
1123
			if p.hasSelfClosingToken {
1124
				p.oe.pop()
1125
				p.acknowledgeSelfClosingTag()
1126
			}
1127
			return true
1128
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1129
			// Ignore the token.
1130
		default:
1131
			p.reconstructActiveFormattingElements()
1132
			p.addElement()
1133
		}
1134
	case EndTagToken:
1135
		switch p.tok.DataAtom {
1136
		case a.Body:
1137
			if p.elementInScope(defaultScope, a.Body) {
1138
				p.im = afterBodyIM
1139
			}
1140
		case a.Html:
1141
			if p.elementInScope(defaultScope, a.Body) {
1142
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
1143
				return false
1144
			}
1145
			return true
1146
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
1147
			p.popUntil(defaultScope, p.tok.DataAtom)
1148
		case a.Form:
1149
			if p.oe.contains(a.Template) {
1150
				i := p.indexOfElementInScope(defaultScope, a.Form)
1151
				if i == -1 {
1152
					// Ignore the token.
1153
					return true
1154
				}
1155
				p.generateImpliedEndTags()
1156
				if p.oe[i].DataAtom != a.Form {
1157
					// Ignore the token.
1158
					return true
1159
				}
1160
				p.popUntil(defaultScope, a.Form)
1161
			} else {
1162
				node := p.form
1163
				p.form = nil
1164
				i := p.indexOfElementInScope(defaultScope, a.Form)
1165
				if node == nil || i == -1 || p.oe[i] != node {
1166
					// Ignore the token.
1167
					return true
1168
				}
1169
				p.generateImpliedEndTags()
1170
				p.oe.remove(node)
1171
			}
1172
		case a.P:
1173
			if !p.elementInScope(buttonScope, a.P) {
1174
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
1175
			}
1176
			p.popUntil(buttonScope, a.P)
1177
		case a.Li:
1178
			p.popUntil(listItemScope, a.Li)
1179
		case a.Dd, a.Dt:
1180
			p.popUntil(defaultScope, p.tok.DataAtom)
1181
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
1182
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
1183
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1184
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
1185
		case a.Applet, a.Marquee, a.Object:
1186
			if p.popUntil(defaultScope, p.tok.DataAtom) {
1187
				p.clearActiveFormattingElements()
1188
			}
1189
		case a.Br:
1190
			p.tok.Type = StartTagToken
1191
			return false
1192
		case a.Template:
1193
			return inHeadIM(p)
1194
		default:
1195
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
1196
		}
1197
	case CommentToken:
1198
		p.addChild(&Node{
1199
			Type: CommentNode,
1200
			Data: p.tok.Data,
1201
		})
1202
	case ErrorToken:
1203
		// TODO: remove this divergence from the HTML5 spec.
1204
		if len(p.templateStack) > 0 {
1205
			p.im = inTemplateIM
1206
			return false
1207
		}
1208
		for _, e := range p.oe {
1209
			switch e.DataAtom {
1210
			case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
1211
				a.Thead, a.Tr, a.Body, a.Html:
1212
			default:
1213
				return true
1214
			}
1215
		}
1216
	}
1217

1218
	return true
1219
}
1220

1221
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
1222
	// This is the "adoption agency" algorithm, described at
1223
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
1224

1225
	// TODO: this is a fairly literal line-by-line translation of that algorithm.
1226
	// Once the code successfully parses the comprehensive test suite, we should
1227
	// refactor this code to be more idiomatic.
1228

1229
	// Steps 1-2
1230
	if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
1231
		p.oe.pop()
1232
		return
1233
	}
1234

1235
	// Steps 3-5. The outer loop.
1236
	for i := 0; i < 8; i++ {
1237
		// Step 6. Find the formatting element.
1238
		var formattingElement *Node
1239
		for j := len(p.afe) - 1; j >= 0; j-- {
1240
			if p.afe[j].Type == scopeMarkerNode {
1241
				break
1242
			}
1243
			if p.afe[j].DataAtom == tagAtom {
1244
				formattingElement = p.afe[j]
1245
				break
1246
			}
1247
		}
1248
		if formattingElement == nil {
1249
			p.inBodyEndTagOther(tagAtom, tagName)
1250
			return
1251
		}
1252

1253
		// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
1254
		feIndex := p.oe.index(formattingElement)
1255
		if feIndex == -1 {
1256
			p.afe.remove(formattingElement)
1257
			return
1258
		}
1259
		// Step 8. Ignore the tag if formatting element is not in the scope.
1260
		if !p.elementInScope(defaultScope, tagAtom) {
1261
			// Ignore the tag.
1262
			return
1263
		}
1264

1265
		// Step 9. This step is omitted because it's just a parse error but no need to return.
1266

1267
		// Steps 10-11. Find the furthest block.
1268
		var furthestBlock *Node
1269
		for _, e := range p.oe[feIndex:] {
1270
			if isSpecialElement(e) {
1271
				furthestBlock = e
1272
				break
1273
			}
1274
		}
1275
		if furthestBlock == nil {
1276
			e := p.oe.pop()
1277
			for e != formattingElement {
1278
				e = p.oe.pop()
1279
			}
1280
			p.afe.remove(e)
1281
			return
1282
		}
1283

1284
		// Steps 12-13. Find the common ancestor and bookmark node.
1285
		commonAncestor := p.oe[feIndex-1]
1286
		bookmark := p.afe.index(formattingElement)
1287

1288
		// Step 14. The inner loop. Find the lastNode to reparent.
1289
		lastNode := furthestBlock
1290
		node := furthestBlock
1291
		x := p.oe.index(node)
1292
		// Step 14.1.
1293
		j := 0
1294
		for {
1295
			// Step 14.2.
1296
			j++
1297
			// Step. 14.3.
1298
			x--
1299
			node = p.oe[x]
1300
			// Step 14.4. Go to the next step if node is formatting element.
1301
			if node == formattingElement {
1302
				break
1303
			}
1304
			// Step 14.5. Remove node from the list of active formatting elements if
1305
			// inner loop counter is greater than three and node is in the list of
1306
			// active formatting elements.
1307
			if ni := p.afe.index(node); j > 3 && ni > -1 {
1308
				p.afe.remove(node)
1309
				// If any element of the list of active formatting elements is removed,
1310
				// we need to take care whether bookmark should be decremented or not.
1311
				// This is because the value of bookmark may exceed the size of the
1312
				// list by removing elements from the list.
1313
				if ni <= bookmark {
1314
					bookmark--
1315
				}
1316
				continue
1317
			}
1318
			// Step 14.6. Continue the next inner loop if node is not in the list of
1319
			// active formatting elements.
1320
			if p.afe.index(node) == -1 {
1321
				p.oe.remove(node)
1322
				continue
1323
			}
1324
			// Step 14.7.
1325
			clone := node.clone()
1326
			p.afe[p.afe.index(node)] = clone
1327
			p.oe[p.oe.index(node)] = clone
1328
			node = clone
1329
			// Step 14.8.
1330
			if lastNode == furthestBlock {
1331
				bookmark = p.afe.index(node) + 1
1332
			}
1333
			// Step 14.9.
1334
			if lastNode.Parent != nil {
1335
				lastNode.Parent.RemoveChild(lastNode)
1336
			}
1337
			node.AppendChild(lastNode)
1338
			// Step 14.10.
1339
			lastNode = node
1340
		}
1341

1342
		// Step 15. Reparent lastNode to the common ancestor,
1343
		// or for misnested table nodes, to the foster parent.
1344
		if lastNode.Parent != nil {
1345
			lastNode.Parent.RemoveChild(lastNode)
1346
		}
1347
		switch commonAncestor.DataAtom {
1348
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1349
			p.fosterParent(lastNode)
1350
		default:
1351
			commonAncestor.AppendChild(lastNode)
1352
		}
1353

1354
		// Steps 16-18. Reparent nodes from the furthest block's children
1355
		// to a clone of the formatting element.
1356
		clone := formattingElement.clone()
1357
		reparentChildren(clone, furthestBlock)
1358
		furthestBlock.AppendChild(clone)
1359

1360
		// Step 19. Fix up the list of active formatting elements.
1361
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
1362
			// Move the bookmark with the rest of the list.
1363
			bookmark--
1364
		}
1365
		p.afe.remove(formattingElement)
1366
		p.afe.insert(bookmark, clone)
1367

1368
		// Step 20. Fix up the stack of open elements.
1369
		p.oe.remove(formattingElement)
1370
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
1371
	}
1372
}
1373

1374
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1375
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1376
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1377
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
1378
	for i := len(p.oe) - 1; i >= 0; i-- {
1379
		// Two element nodes have the same tag if they have the same Data (a
1380
		// string-typed field). As an optimization, for common HTML tags, each
1381
		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
1382
		// field), since integer comparison is faster than string comparison.
1383
		// Uncommon (custom) tags get a zero DataAtom.
1384
		//
1385
		// The if condition here is equivalent to (p.oe[i].Data == tagName).
1386
		if (p.oe[i].DataAtom == tagAtom) &&
1387
			((tagAtom != 0) || (p.oe[i].Data == tagName)) {
1388
			p.oe = p.oe[:i]
1389
			break
1390
		}
1391
		if isSpecialElement(p.oe[i]) {
1392
			break
1393
		}
1394
	}
1395
}
1396

1397
// Section 12.2.6.4.8.
1398
func textIM(p *parser) bool {
1399
	switch p.tok.Type {
1400
	case ErrorToken:
1401
		p.oe.pop()
1402
	case TextToken:
1403
		d := p.tok.Data
1404
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1405
			// Ignore a newline at the start of a <textarea> block.
1406
			if d != "" && d[0] == '\r' {
1407
				d = d[1:]
1408
			}
1409
			if d != "" && d[0] == '\n' {
1410
				d = d[1:]
1411
			}
1412
		}
1413
		if d == "" {
1414
			return true
1415
		}
1416
		p.addText(d)
1417
		return true
1418
	case EndTagToken:
1419
		p.oe.pop()
1420
	}
1421
	p.im = p.originalIM
1422
	p.originalIM = nil
1423
	return p.tok.Type == EndTagToken
1424
}
1425

1426
// Section 12.2.6.4.9.
1427
func inTableIM(p *parser) bool {
1428
	switch p.tok.Type {
1429
	case TextToken:
1430
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1431
		switch p.oe.top().DataAtom {
1432
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1433
			if strings.Trim(p.tok.Data, whitespace) == "" {
1434
				p.addText(p.tok.Data)
1435
				return true
1436
			}
1437
		}
1438
	case StartTagToken:
1439
		switch p.tok.DataAtom {
1440
		case a.Caption:
1441
			p.clearStackToContext(tableScope)
1442
			p.afe = append(p.afe, &scopeMarker)
1443
			p.addElement()
1444
			p.im = inCaptionIM
1445
			return true
1446
		case a.Colgroup:
1447
			p.clearStackToContext(tableScope)
1448
			p.addElement()
1449
			p.im = inColumnGroupIM
1450
			return true
1451
		case a.Col:
1452
			p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1453
			return false
1454
		case a.Tbody, a.Tfoot, a.Thead:
1455
			p.clearStackToContext(tableScope)
1456
			p.addElement()
1457
			p.im = inTableBodyIM
1458
			return true
1459
		case a.Td, a.Th, a.Tr:
1460
			p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1461
			return false
1462
		case a.Table:
1463
			if p.popUntil(tableScope, a.Table) {
1464
				p.resetInsertionMode()
1465
				return false
1466
			}
1467
			// Ignore the token.
1468
			return true
1469
		case a.Style, a.Script, a.Template:
1470
			return inHeadIM(p)
1471
		case a.Input:
1472
			for _, t := range p.tok.Attr {
1473
				if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {
1474
					p.addElement()
1475
					p.oe.pop()
1476
					return true
1477
				}
1478
			}
1479
			// Otherwise drop down to the default action.
1480
		case a.Form:
1481
			if p.oe.contains(a.Template) || p.form != nil {
1482
				// Ignore the token.
1483
				return true
1484
			}
1485
			p.addElement()
1486
			p.form = p.oe.pop()
1487
		case a.Select:
1488
			p.reconstructActiveFormattingElements()
1489
			switch p.top().DataAtom {
1490
			case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1491
				p.fosterParenting = true
1492
			}
1493
			p.addElement()
1494
			p.fosterParenting = false
1495
			p.framesetOK = false
1496
			p.im = inSelectInTableIM
1497
			return true
1498
		}
1499
	case EndTagToken:
1500
		switch p.tok.DataAtom {
1501
		case a.Table:
1502
			if p.popUntil(tableScope, a.Table) {
1503
				p.resetInsertionMode()
1504
				return true
1505
			}
1506
			// Ignore the token.
1507
			return true
1508
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1509
			// Ignore the token.
1510
			return true
1511
		case a.Template:
1512
			return inHeadIM(p)
1513
		}
1514
	case CommentToken:
1515
		p.addChild(&Node{
1516
			Type: CommentNode,
1517
			Data: p.tok.Data,
1518
		})
1519
		return true
1520
	case DoctypeToken:
1521
		// Ignore the token.
1522
		return true
1523
	case ErrorToken:
1524
		return inBodyIM(p)
1525
	}
1526

1527
	p.fosterParenting = true
1528
	defer func() { p.fosterParenting = false }()
1529

1530
	return inBodyIM(p)
1531
}
1532

1533
// Section 12.2.6.4.11.
1534
func inCaptionIM(p *parser) bool {
1535
	switch p.tok.Type {
1536
	case StartTagToken:
1537
		switch p.tok.DataAtom {
1538
		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1539
			if !p.popUntil(tableScope, a.Caption) {
1540
				// Ignore the token.
1541
				return true
1542
			}
1543
			p.clearActiveFormattingElements()
1544
			p.im = inTableIM
1545
			return false
1546
		case a.Select:
1547
			p.reconstructActiveFormattingElements()
1548
			p.addElement()
1549
			p.framesetOK = false
1550
			p.im = inSelectInTableIM
1551
			return true
1552
		}
1553
	case EndTagToken:
1554
		switch p.tok.DataAtom {
1555
		case a.Caption:
1556
			if p.popUntil(tableScope, a.Caption) {
1557
				p.clearActiveFormattingElements()
1558
				p.im = inTableIM
1559
			}
1560
			return true
1561
		case a.Table:
1562
			if !p.popUntil(tableScope, a.Caption) {
1563
				// Ignore the token.
1564
				return true
1565
			}
1566
			p.clearActiveFormattingElements()
1567
			p.im = inTableIM
1568
			return false
1569
		case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1570
			// Ignore the token.
1571
			return true
1572
		}
1573
	}
1574
	return inBodyIM(p)
1575
}
1576

1577
// Section 12.2.6.4.12.
1578
func inColumnGroupIM(p *parser) bool {
1579
	switch p.tok.Type {
1580
	case TextToken:
1581
		s := strings.TrimLeft(p.tok.Data, whitespace)
1582
		if len(s) < len(p.tok.Data) {
1583
			// Add the initial whitespace to the current node.
1584
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1585
			if s == "" {
1586
				return true
1587
			}
1588
			p.tok.Data = s
1589
		}
1590
	case CommentToken:
1591
		p.addChild(&Node{
1592
			Type: CommentNode,
1593
			Data: p.tok.Data,
1594
		})
1595
		return true
1596
	case DoctypeToken:
1597
		// Ignore the token.
1598
		return true
1599
	case StartTagToken:
1600
		switch p.tok.DataAtom {
1601
		case a.Html:
1602
			return inBodyIM(p)
1603
		case a.Col:
1604
			p.addElement()
1605
			p.oe.pop()
1606
			p.acknowledgeSelfClosingTag()
1607
			return true
1608
		case a.Template:
1609
			return inHeadIM(p)
1610
		}
1611
	case EndTagToken:
1612
		switch p.tok.DataAtom {
1613
		case a.Colgroup:
1614
			if p.oe.top().DataAtom == a.Colgroup {
1615
				p.oe.pop()
1616
				p.im = inTableIM
1617
			}
1618
			return true
1619
		case a.Col:
1620
			// Ignore the token.
1621
			return true
1622
		case a.Template:
1623
			return inHeadIM(p)
1624
		}
1625
	case ErrorToken:
1626
		return inBodyIM(p)
1627
	}
1628
	if p.oe.top().DataAtom != a.Colgroup {
1629
		return true
1630
	}
1631
	p.oe.pop()
1632
	p.im = inTableIM
1633
	return false
1634
}
1635

1636
// Section 12.2.6.4.13.
1637
func inTableBodyIM(p *parser) bool {
1638
	switch p.tok.Type {
1639
	case StartTagToken:
1640
		switch p.tok.DataAtom {
1641
		case a.Tr:
1642
			p.clearStackToContext(tableBodyScope)
1643
			p.addElement()
1644
			p.im = inRowIM
1645
			return true
1646
		case a.Td, a.Th:
1647
			p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1648
			return false
1649
		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1650
			if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1651
				p.im = inTableIM
1652
				return false
1653
			}
1654
			// Ignore the token.
1655
			return true
1656
		}
1657
	case EndTagToken:
1658
		switch p.tok.DataAtom {
1659
		case a.Tbody, a.Tfoot, a.Thead:
1660
			if p.elementInScope(tableScope, p.tok.DataAtom) {
1661
				p.clearStackToContext(tableBodyScope)
1662
				p.oe.pop()
1663
				p.im = inTableIM
1664
			}
1665
			return true
1666
		case a.Table:
1667
			if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1668
				p.im = inTableIM
1669
				return false
1670
			}
1671
			// Ignore the token.
1672
			return true
1673
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1674
			// Ignore the token.
1675
			return true
1676
		}
1677
	case CommentToken:
1678
		p.addChild(&Node{
1679
			Type: CommentNode,
1680
			Data: p.tok.Data,
1681
		})
1682
		return true
1683
	}
1684

1685
	return inTableIM(p)
1686
}
1687

1688
// Section 13.2.6.4.14.
1689
func inRowIM(p *parser) bool {
1690
	switch p.tok.Type {
1691
	case StartTagToken:
1692
		switch p.tok.DataAtom {
1693
		case a.Td, a.Th:
1694
			p.clearStackToContext(tableRowScope)
1695
			p.addElement()
1696
			p.afe = append(p.afe, &scopeMarker)
1697
			p.im = inCellIM
1698
			return true
1699
		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1700
			if p.elementInScope(tableScope, a.Tr) {
1701
				p.clearStackToContext(tableRowScope)
1702
				p.oe.pop()
1703
				p.im = inTableBodyIM
1704
				return false
1705
			}
1706
			// Ignore the token.
1707
			return true
1708
		}
1709
	case EndTagToken:
1710
		switch p.tok.DataAtom {
1711
		case a.Tr:
1712
			if p.elementInScope(tableScope, a.Tr) {
1713
				p.clearStackToContext(tableRowScope)
1714
				p.oe.pop()
1715
				p.im = inTableBodyIM
1716
				return true
1717
			}
1718
			// Ignore the token.
1719
			return true
1720
		case a.Table:
1721
			if p.elementInScope(tableScope, a.Tr) {
1722
				p.clearStackToContext(tableRowScope)
1723
				p.oe.pop()
1724
				p.im = inTableBodyIM
1725
				return false
1726
			}
1727
			// Ignore the token.
1728
			return true
1729
		case a.Tbody, a.Tfoot, a.Thead:
1730
			if p.elementInScope(tableScope, p.tok.DataAtom) && p.elementInScope(tableScope, a.Tr) {
1731
				p.clearStackToContext(tableRowScope)
1732
				p.oe.pop()
1733
				p.im = inTableBodyIM
1734
				return false
1735
			}
1736
			// Ignore the token.
1737
			return true
1738
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1739
			// Ignore the token.
1740
			return true
1741
		}
1742
	}
1743

1744
	return inTableIM(p)
1745
}
1746

1747
// Section 12.2.6.4.15.
1748
func inCellIM(p *parser) bool {
1749
	switch p.tok.Type {
1750
	case StartTagToken:
1751
		switch p.tok.DataAtom {
1752
		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1753
			if p.popUntil(tableScope, a.Td, a.Th) {
1754
				// Close the cell and reprocess.
1755
				p.clearActiveFormattingElements()
1756
				p.im = inRowIM
1757
				return false
1758
			}
1759
			// Ignore the token.
1760
			return true
1761
		case a.Select:
1762
			p.reconstructActiveFormattingElements()
1763
			p.addElement()
1764
			p.framesetOK = false
1765
			p.im = inSelectInTableIM
1766
			return true
1767
		}
1768
	case EndTagToken:
1769
		switch p.tok.DataAtom {
1770
		case a.Td, a.Th:
1771
			if !p.popUntil(tableScope, p.tok.DataAtom) {
1772
				// Ignore the token.
1773
				return true
1774
			}
1775
			p.clearActiveFormattingElements()
1776
			p.im = inRowIM
1777
			return true
1778
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1779
			// Ignore the token.
1780
			return true
1781
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1782
			if !p.elementInScope(tableScope, p.tok.DataAtom) {
1783
				// Ignore the token.
1784
				return true
1785
			}
1786
			// Close the cell and reprocess.
1787
			if p.popUntil(tableScope, a.Td, a.Th) {
1788
				p.clearActiveFormattingElements()
1789
			}
1790
			p.im = inRowIM
1791
			return false
1792
		}
1793
	}
1794
	return inBodyIM(p)
1795
}
1796

1797
// Section 12.2.6.4.16.
1798
func inSelectIM(p *parser) bool {
1799
	switch p.tok.Type {
1800
	case TextToken:
1801
		p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1802
	case StartTagToken:
1803
		switch p.tok.DataAtom {
1804
		case a.Html:
1805
			return inBodyIM(p)
1806
		case a.Option:
1807
			if p.top().DataAtom == a.Option {
1808
				p.oe.pop()
1809
			}
1810
			p.addElement()
1811
		case a.Optgroup:
1812
			if p.top().DataAtom == a.Option {
1813
				p.oe.pop()
1814
			}
1815
			if p.top().DataAtom == a.Optgroup {
1816
				p.oe.pop()
1817
			}
1818
			p.addElement()
1819
		case a.Select:
1820
			if !p.popUntil(selectScope, a.Select) {
1821
				// Ignore the token.
1822
				return true
1823
			}
1824
			p.resetInsertionMode()
1825
		case a.Input, a.Keygen, a.Textarea:
1826
			if p.elementInScope(selectScope, a.Select) {
1827
				p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1828
				return false
1829
			}
1830
			// In order to properly ignore <textarea>, we need to change the tokenizer mode.
1831
			p.tokenizer.NextIsNotRawText()
1832
			// Ignore the token.
1833
			return true
1834
		case a.Script, a.Template:
1835
			return inHeadIM(p)
1836
		case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
1837
			// Don't let the tokenizer go into raw text mode when there are raw tags
1838
			// to be ignored. These tags should be ignored from the tokenizer
1839
			// properly.
1840
			p.tokenizer.NextIsNotRawText()
1841
			// Ignore the token.
1842
			return true
1843
		}
1844
	case EndTagToken:
1845
		switch p.tok.DataAtom {
1846
		case a.Option:
1847
			if p.top().DataAtom == a.Option {
1848
				p.oe.pop()
1849
			}
1850
		case a.Optgroup:
1851
			i := len(p.oe) - 1
1852
			if p.oe[i].DataAtom == a.Option {
1853
				i--
1854
			}
1855
			if p.oe[i].DataAtom == a.Optgroup {
1856
				p.oe = p.oe[:i]
1857
			}
1858
		case a.Select:
1859
			if !p.popUntil(selectScope, a.Select) {
1860
				// Ignore the token.
1861
				return true
1862
			}
1863
			p.resetInsertionMode()
1864
		case a.Template:
1865
			return inHeadIM(p)
1866
		}
1867
	case CommentToken:
1868
		p.addChild(&Node{
1869
			Type: CommentNode,
1870
			Data: p.tok.Data,
1871
		})
1872
	case DoctypeToken:
1873
		// Ignore the token.
1874
		return true
1875
	case ErrorToken:
1876
		return inBodyIM(p)
1877
	}
1878

1879
	return true
1880
}
1881

1882
// Section 12.2.6.4.17.
1883
func inSelectInTableIM(p *parser) bool {
1884
	switch p.tok.Type {
1885
	case StartTagToken, EndTagToken:
1886
		switch p.tok.DataAtom {
1887
		case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1888
			if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
1889
				// Ignore the token.
1890
				return true
1891
			}
1892
			// This is like p.popUntil(selectScope, a.Select), but it also
1893
			// matches <math select>, not just <select>. Matching the MathML
1894
			// tag is arguably incorrect (conceptually), but it mimics what
1895
			// Chromium does.
1896
			for i := len(p.oe) - 1; i >= 0; i-- {
1897
				if n := p.oe[i]; n.DataAtom == a.Select {
1898
					p.oe = p.oe[:i]
1899
					break
1900
				}
1901
			}
1902
			p.resetInsertionMode()
1903
			return false
1904
		}
1905
	}
1906
	return inSelectIM(p)
1907
}
1908

1909
// Section 12.2.6.4.18.
1910
func inTemplateIM(p *parser) bool {
1911
	switch p.tok.Type {
1912
	case TextToken, CommentToken, DoctypeToken:
1913
		return inBodyIM(p)
1914
	case StartTagToken:
1915
		switch p.tok.DataAtom {
1916
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1917
			return inHeadIM(p)
1918
		case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1919
			p.templateStack.pop()
1920
			p.templateStack = append(p.templateStack, inTableIM)
1921
			p.im = inTableIM
1922
			return false
1923
		case a.Col:
1924
			p.templateStack.pop()
1925
			p.templateStack = append(p.templateStack, inColumnGroupIM)
1926
			p.im = inColumnGroupIM
1927
			return false
1928
		case a.Tr:
1929
			p.templateStack.pop()
1930
			p.templateStack = append(p.templateStack, inTableBodyIM)
1931
			p.im = inTableBodyIM
1932
			return false
1933
		case a.Td, a.Th:
1934
			p.templateStack.pop()
1935
			p.templateStack = append(p.templateStack, inRowIM)
1936
			p.im = inRowIM
1937
			return false
1938
		default:
1939
			p.templateStack.pop()
1940
			p.templateStack = append(p.templateStack, inBodyIM)
1941
			p.im = inBodyIM
1942
			return false
1943
		}
1944
	case EndTagToken:
1945
		switch p.tok.DataAtom {
1946
		case a.Template:
1947
			return inHeadIM(p)
1948
		default:
1949
			// Ignore the token.
1950
			return true
1951
		}
1952
	case ErrorToken:
1953
		if !p.oe.contains(a.Template) {
1954
			// Ignore the token.
1955
			return true
1956
		}
1957
		// TODO: remove this divergence from the HTML5 spec.
1958
		//
1959
		// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1960
		p.generateImpliedEndTags()
1961
		for i := len(p.oe) - 1; i >= 0; i-- {
1962
			if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1963
				p.oe = p.oe[:i]
1964
				break
1965
			}
1966
		}
1967
		p.clearActiveFormattingElements()
1968
		p.templateStack.pop()
1969
		p.resetInsertionMode()
1970
		return false
1971
	}
1972
	return false
1973
}
1974

1975
// Section 12.2.6.4.19.
1976
func afterBodyIM(p *parser) bool {
1977
	switch p.tok.Type {
1978
	case ErrorToken:
1979
		// Stop parsing.
1980
		return true
1981
	case TextToken:
1982
		s := strings.TrimLeft(p.tok.Data, whitespace)
1983
		if len(s) == 0 {
1984
			// It was all whitespace.
1985
			return inBodyIM(p)
1986
		}
1987
	case StartTagToken:
1988
		if p.tok.DataAtom == a.Html {
1989
			return inBodyIM(p)
1990
		}
1991
	case EndTagToken:
1992
		if p.tok.DataAtom == a.Html {
1993
			if !p.fragment {
1994
				p.im = afterAfterBodyIM
1995
			}
1996
			return true
1997
		}
1998
	case CommentToken:
1999
		// The comment is attached to the <html> element.
2000
		if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
2001
			panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
2002
		}
2003
		p.oe[0].AppendChild(&Node{
2004
			Type: CommentNode,
2005
			Data: p.tok.Data,
2006
		})
2007
		return true
2008
	}
2009
	p.im = inBodyIM
2010
	return false
2011
}
2012

2013
// Section 12.2.6.4.20.
2014
func inFramesetIM(p *parser) bool {
2015
	switch p.tok.Type {
2016
	case CommentToken:
2017
		p.addChild(&Node{
2018
			Type: CommentNode,
2019
			Data: p.tok.Data,
2020
		})
2021
	case TextToken:
2022
		// Ignore all text but whitespace.
2023
		s := strings.Map(func(c rune) rune {
2024
			switch c {
2025
			case ' ', '\t', '\n', '\f', '\r':
2026
				return c
2027
			}
2028
			return -1
2029
		}, p.tok.Data)
2030
		if s != "" {
2031
			p.addText(s)
2032
		}
2033
	case StartTagToken:
2034
		switch p.tok.DataAtom {
2035
		case a.Html:
2036
			return inBodyIM(p)
2037
		case a.Frameset:
2038
			p.addElement()
2039
		case a.Frame:
2040
			p.addElement()
2041
			p.oe.pop()
2042
			p.acknowledgeSelfClosingTag()
2043
		case a.Noframes:
2044
			return inHeadIM(p)
2045
		}
2046
	case EndTagToken:
2047
		switch p.tok.DataAtom {
2048
		case a.Frameset:
2049
			if p.oe.top().DataAtom != a.Html {
2050
				p.oe.pop()
2051
				if p.oe.top().DataAtom != a.Frameset {
2052
					p.im = afterFramesetIM
2053
					return true
2054
				}
2055
			}
2056
		}
2057
	default:
2058
		// Ignore the token.
2059
	}
2060
	return true
2061
}
2062

2063
// Section 12.2.6.4.21.
2064
func afterFramesetIM(p *parser) bool {
2065
	switch p.tok.Type {
2066
	case CommentToken:
2067
		p.addChild(&Node{
2068
			Type: CommentNode,
2069
			Data: p.tok.Data,
2070
		})
2071
	case TextToken:
2072
		// Ignore all text but whitespace.
2073
		s := strings.Map(func(c rune) rune {
2074
			switch c {
2075
			case ' ', '\t', '\n', '\f', '\r':
2076
				return c
2077
			}
2078
			return -1
2079
		}, p.tok.Data)
2080
		if s != "" {
2081
			p.addText(s)
2082
		}
2083
	case StartTagToken:
2084
		switch p.tok.DataAtom {
2085
		case a.Html:
2086
			return inBodyIM(p)
2087
		case a.Noframes:
2088
			return inHeadIM(p)
2089
		}
2090
	case EndTagToken:
2091
		switch p.tok.DataAtom {
2092
		case a.Html:
2093
			p.im = afterAfterFramesetIM
2094
			return true
2095
		}
2096
	default:
2097
		// Ignore the token.
2098
	}
2099
	return true
2100
}
2101

2102
// Section 12.2.6.4.22.
2103
func afterAfterBodyIM(p *parser) bool {
2104
	switch p.tok.Type {
2105
	case ErrorToken:
2106
		// Stop parsing.
2107
		return true
2108
	case TextToken:
2109
		s := strings.TrimLeft(p.tok.Data, whitespace)
2110
		if len(s) == 0 {
2111
			// It was all whitespace.
2112
			return inBodyIM(p)
2113
		}
2114
	case StartTagToken:
2115
		if p.tok.DataAtom == a.Html {
2116
			return inBodyIM(p)
2117
		}
2118
	case CommentToken:
2119
		p.doc.AppendChild(&Node{
2120
			Type: CommentNode,
2121
			Data: p.tok.Data,
2122
		})
2123
		return true
2124
	case DoctypeToken:
2125
		return inBodyIM(p)
2126
	}
2127
	p.im = inBodyIM
2128
	return false
2129
}
2130

2131
// Section 12.2.6.4.23.
2132
func afterAfterFramesetIM(p *parser) bool {
2133
	switch p.tok.Type {
2134
	case CommentToken:
2135
		p.doc.AppendChild(&Node{
2136
			Type: CommentNode,
2137
			Data: p.tok.Data,
2138
		})
2139
	case TextToken:
2140
		// Ignore all text but whitespace.
2141
		s := strings.Map(func(c rune) rune {
2142
			switch c {
2143
			case ' ', '\t', '\n', '\f', '\r':
2144
				return c
2145
			}
2146
			return -1
2147
		}, p.tok.Data)
2148
		if s != "" {
2149
			p.tok.Data = s
2150
			return inBodyIM(p)
2151
		}
2152
	case StartTagToken:
2153
		switch p.tok.DataAtom {
2154
		case a.Html:
2155
			return inBodyIM(p)
2156
		case a.Noframes:
2157
			return inHeadIM(p)
2158
		}
2159
	case DoctypeToken:
2160
		return inBodyIM(p)
2161
	default:
2162
		// Ignore the token.
2163
	}
2164
	return true
2165
}
2166

2167
// ignoreTheRemainingTokens has the shape of an insertion mode function. It
// reports every token as consumed without inspecting the parser.
func ignoreTheRemainingTokens(*parser) bool {
	return true
}
2170

2171
// whitespaceOrNUL is the whitespace cutset with the NUL byte appended.
const whitespaceOrNUL = whitespace + "\x00"
2172

2173
// Section 12.2.6.5
2174
func parseForeignContent(p *parser) bool {
2175
	switch p.tok.Type {
2176
	case TextToken:
2177
		if p.framesetOK {
2178
			p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
2179
		}
2180
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2181
		p.addText(p.tok.Data)
2182
	case CommentToken:
2183
		p.addChild(&Node{
2184
			Type: CommentNode,
2185
			Data: p.tok.Data,
2186
		})
2187
	case StartTagToken:
2188
		if !p.fragment {
2189
			b := breakout[p.tok.Data]
2190
			if p.tok.DataAtom == a.Font {
2191
			loop:
2192
				for _, attr := range p.tok.Attr {
2193
					switch attr.Key {
2194
					case "color", "face", "size":
2195
						b = true
2196
						break loop
2197
					}
2198
				}
2199
			}
2200
			if b {
2201
				for i := len(p.oe) - 1; i >= 0; i-- {
2202
					n := p.oe[i]
2203
					if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2204
						p.oe = p.oe[:i+1]
2205
						break
2206
					}
2207
				}
2208
				return false
2209
			}
2210
		}
2211
		current := p.adjustedCurrentNode()
2212
		switch current.Namespace {
2213
		case "math":
2214
			adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2215
		case "svg":
2216
			// Adjust SVG tag names. The tokenizer lower-cases tag names, but
2217
			// SVG wants e.g. "foreignObject" with a capital second "O".
2218
			if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2219
				p.tok.DataAtom = a.Lookup([]byte(x))
2220
				p.tok.Data = x
2221
			}
2222
			adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2223
		default:
2224
			panic("html: bad parser state: unexpected namespace")
2225
		}
2226
		adjustForeignAttributes(p.tok.Attr)
2227
		namespace := current.Namespace
2228
		p.addElement()
2229
		p.top().Namespace = namespace
2230
		if namespace != "" {
2231
			// Don't let the tokenizer go into raw text mode in foreign content
2232
			// (e.g. in an SVG <title> tag).
2233
			p.tokenizer.NextIsNotRawText()
2234
		}
2235
		if p.hasSelfClosingToken {
2236
			p.oe.pop()
2237
			p.acknowledgeSelfClosingTag()
2238
		}
2239
	case EndTagToken:
2240
		if strings.EqualFold(p.oe[len(p.oe)-1].Data, p.tok.Data) {
2241
			p.oe = p.oe[:len(p.oe)-1]
2242
			return true
2243
		}
2244
		for i := len(p.oe) - 1; i >= 0; i-- {
2245
			if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2246
				p.oe = p.oe[:i]
2247
				return true
2248
			}
2249
			if i > 0 && p.oe[i-1].Namespace == "" {
2250
				break
2251
			}
2252
		}
2253
		return p.im(p)
2254
	default:
2255
		// Ignore the token.
2256
	}
2257
	return true
2258
}
2259

2260
// Section 12.2.4.2.
2261
func (p *parser) adjustedCurrentNode() *Node {
2262
	if len(p.oe) == 1 && p.fragment && p.context != nil {
2263
		return p.context
2264
	}
2265
	return p.oe.top()
2266
}
2267

2268
// Section 12.2.6.
2269
func (p *parser) inForeignContent() bool {
2270
	if len(p.oe) == 0 {
2271
		return false
2272
	}
2273
	n := p.adjustedCurrentNode()
2274
	if n.Namespace == "" {
2275
		return false
2276
	}
2277
	if mathMLTextIntegrationPoint(n) {
2278
		if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
2279
			return false
2280
		}
2281
		if p.tok.Type == TextToken {
2282
			return false
2283
		}
2284
	}
2285
	if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
2286
		return false
2287
	}
2288
	if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
2289
		return false
2290
	}
2291
	if p.tok.Type == ErrorToken {
2292
		return false
2293
	}
2294
	return true
2295
}
2296

2297
// parseImpliedToken parses a token as though it had appeared in the parser's
2298
// input.
2299
func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2300
	realToken, selfClosing := p.tok, p.hasSelfClosingToken
2301
	p.tok = Token{
2302
		Type:     t,
2303
		DataAtom: dataAtom,
2304
		Data:     data,
2305
	}
2306
	p.hasSelfClosingToken = false
2307
	p.parseCurrentToken()
2308
	p.tok, p.hasSelfClosingToken = realToken, selfClosing
2309
}
2310

2311
// parseCurrentToken runs the current token through the parsing routines
2312
// until it is consumed.
2313
func (p *parser) parseCurrentToken() {
2314
	if p.tok.Type == SelfClosingTagToken {
2315
		p.hasSelfClosingToken = true
2316
		p.tok.Type = StartTagToken
2317
	}
2318

2319
	consumed := false
2320
	for !consumed {
2321
		if p.inForeignContent() {
2322
			consumed = parseForeignContent(p)
2323
		} else {
2324
			consumed = p.im(p)
2325
		}
2326
	}
2327

2328
	if p.hasSelfClosingToken {
2329
		// This is a parse error, but ignore it.
2330
		p.hasSelfClosingToken = false
2331
	}
2332
}
2333

2334
// parse reads tokens from the tokenizer and runs each one through
// parseCurrentToken until the tokenizer reports io.EOF.
//
// It returns nil once io.EOF has been reached, or the tokenizer's error when
// that error is anything other than io.EOF. A panic raised while parsing is
// recovered and reported as the returned error: a panic value that is itself
// an error is wrapped, so errors.Is and errors.As can still inspect it, and
// any other value is formatted with %v.
func (p *parser) parse() (err error) {
	defer func() {
		panicVal := recover()
		if panicVal == nil {
			return
		}
		if panicErr, ok := panicVal.(error); ok {
			// Keep the original error in the chain instead of flattening it
			// to text.
			err = fmt.Errorf("%w", panicErr)
			return
		}
		// %v renders every value kind; %s would produce "%!s(...)" noise for
		// values that are neither strings nor Stringers.
		err = fmt.Errorf("%v", panicVal)
	}()
	// Iterate until EOF. Any other error will cause an early return.
	for err != io.EOF {
		// CDATA sections are allowed only in foreign content.
		n := p.oe.top()
		p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
		// Read and parse the next token.
		p.tokenizer.Next()
		p.tok = p.tokenizer.Token()
		if p.tok.Type == ErrorToken {
			err = p.tokenizer.Err()
			if err != nil && err != io.EOF {
				return err
			}
		}
		// The ErrorToken seen at EOF is parsed as well before the loop ends.
		p.parseCurrentToken()
	}
	return nil
}
2358

2359
// Parse returns the parse tree for the HTML from the given Reader.
2360
//
2361
// It implements the HTML5 parsing algorithm
2362
// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
2363
// which is very complicated. The resultant tree can contain implicitly created
2364
// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
2365
// differ from the nesting implied by a naive processing of start and end
2366
// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
2367
// with no corresponding node in the resulting tree.
2368
//
2369
// Parse will reject HTML that is nested deeper than 512 elements.
2370
//
2371
// The input is assumed to be UTF-8 encoded.
2372
func Parse(r io.Reader) (*Node, error) {
2373
	return ParseWithOptions(r)
2374
}
2375

2376
// ParseFragment parses a fragment of HTML and returns the nodes that were
2377
// found. If the fragment is the InnerHTML for an existing element, pass that
2378
// element in context.
2379
//
2380
// It has the same intricacies as Parse.
2381
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2382
	return ParseFragmentWithOptions(r, context)
2383
}
2384

2385
// ParseOption configures a parser.
2386
type ParseOption func(p *parser)
2387

2388
// ParseOptionEnableScripting configures the scripting flag.
2389
// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
2390
//
2391
// By default, scripting is enabled.
2392
func ParseOptionEnableScripting(enable bool) ParseOption {
2393
	return func(p *parser) {
2394
		p.scripting = enable
2395
	}
2396
}
2397

2398
// ParseWithOptions is like Parse, with options.
2399
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
2400
	p := &parser{
2401
		tokenizer: NewTokenizer(r),
2402
		doc: &Node{
2403
			Type: DocumentNode,
2404
		},
2405
		scripting:  true,
2406
		framesetOK: true,
2407
		im:         initialIM,
2408
	}
2409

2410
	for _, f := range opts {
2411
		f(p)
2412
	}
2413

2414
	if err := p.parse(); err != nil {
2415
		return nil, err
2416
	}
2417
	return p.doc, nil
2418
}
2419

2420
// ParseFragmentWithOptions is like ParseFragment, with options.
2421
func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
2422
	contextTag := ""
2423
	if context != nil {
2424
		if context.Type != ElementNode {
2425
			return nil, errors.New("html: ParseFragment of non-element Node")
2426
		}
2427
		// The next check isn't just context.DataAtom.String() == context.Data because
2428
		// it is valid to pass an element whose tag isn't a known atom. For example,
2429
		// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2430
		if context.DataAtom != a.Lookup([]byte(context.Data)) {
2431
			return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2432
		}
2433
		contextTag = context.DataAtom.String()
2434
	}
2435
	p := &parser{
2436
		doc: &Node{
2437
			Type: DocumentNode,
2438
		},
2439
		scripting: true,
2440
		fragment:  true,
2441
		context:   context,
2442
	}
2443
	if context != nil && context.Namespace != "" {
2444
		p.tokenizer = NewTokenizer(r)
2445
	} else {
2446
		p.tokenizer = NewTokenizerFragment(r, contextTag)
2447
	}
2448

2449
	for _, f := range opts {
2450
		f(p)
2451
	}
2452

2453
	root := &Node{
2454
		Type:     ElementNode,
2455
		DataAtom: a.Html,
2456
		Data:     a.Html.String(),
2457
	}
2458
	p.doc.AppendChild(root)
2459
	p.oe = nodeStack{root}
2460
	if context != nil && context.DataAtom == a.Template {
2461
		p.templateStack = append(p.templateStack, inTemplateIM)
2462
	}
2463
	p.resetInsertionMode()
2464

2465
	for n := context; n != nil; n = n.Parent {
2466
		if n.Type == ElementNode && n.DataAtom == a.Form {
2467
			p.form = n
2468
			break
2469
		}
2470
	}
2471

2472
	if err := p.parse(); err != nil {
2473
		return nil, err
2474
	}
2475

2476
	parent := p.doc
2477
	if context != nil {
2478
		parent = root
2479
	}
2480

2481
	var result []*Node
2482
	for c := parent.FirstChild; c != nil; {
2483
		next := c.NextSibling
2484
		parent.RemoveChild(c)
2485
		result = append(result, c)
2486
		c = next
2487
	}
2488
	return result, nil
2489
}
2490

2491
Product

Resources

Company