Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
kardolus
GitHub Repository: kardolus/chatgpt-cli
Path: blob/main/vendor/golang.org/x/net/html/parse.go
2880 views
1
// Copyright 2010 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
package html
6
7
import (
8
"errors"
9
"fmt"
10
"io"
11
"strings"
12
13
a "golang.org/x/net/html/atom"
14
)
15
16
// A parser implements the HTML5 parsing algorithm:
17
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
18
type parser struct {
19
// tokenizer provides the tokens for the parser.
20
tokenizer *Tokenizer
21
// tok is the most recently read token.
22
tok Token
23
// Self-closing tags like <hr/> are treated as start tags, except that
24
// hasSelfClosingToken is set while they are being processed.
25
hasSelfClosingToken bool
26
// doc is the document root element.
27
doc *Node
28
// The stack of open elements (section 12.2.4.2) and active formatting
29
// elements (section 12.2.4.3).
30
oe, afe nodeStack
31
// Element pointers (section 12.2.4.4).
32
head, form *Node
33
// Other parsing state flags (section 12.2.4.5).
34
scripting, framesetOK bool
35
// The stack of template insertion modes
36
templateStack insertionModeStack
37
// im is the current insertion mode.
38
im insertionMode
39
// originalIM is the insertion mode to go back to after completing a text
40
// or inTableText insertion mode.
41
originalIM insertionMode
42
// fosterParenting is whether new elements should be inserted according to
43
// the foster parenting rules (section 12.2.6.1).
44
fosterParenting bool
45
// quirks is whether the parser is operating in "quirks mode."
46
quirks bool
47
// fragment is whether the parser is parsing an HTML fragment.
48
fragment bool
49
// context is the context element when parsing an HTML fragment
50
// (section 12.4).
51
context *Node
52
}
53
54
// top returns the node on top of the stack of open elements, falling back to
// the document root when the stack yields no node.
func (p *parser) top() *Node {
	n := p.oe.top()
	if n == nil {
		return p.doc
	}
	return n
}
60
61
// Stop tags for use in popUntil. These come from section 12.2.4.2.
62
var (
63
defaultScopeStopTags = map[string][]a.Atom{
64
"": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
65
"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
66
"svg": {a.Desc, a.ForeignObject, a.Title},
67
}
68
)
69
70
// scope selects which set of stop tags applies when searching or clearing the
// stack of open elements.
type scope int

// The scopes understood by indexOfElementInScope and clearStackToContext.
// The numeric values are consecutive, starting at zero.
const (
	defaultScope   scope = 0
	listItemScope  scope = 1
	buttonScope    scope = 2
	tableScope     scope = 3
	tableRowScope  scope = 4
	tableBodyScope scope = 5
	selectScope    scope = 6
)
81
82
// popUntil pops the stack of open elements at the highest element whose tag
83
// is in matchTags, provided there is no higher element in the scope's stop
84
// tags (as defined in section 12.2.4.2). It returns whether or not there was
85
// such an element. If there was not, popUntil leaves the stack unchanged.
86
//
87
// For example, the set of stop tags for table scope is: "html", "table". If
88
// the stack was:
89
// ["html", "body", "font", "table", "b", "i", "u"]
90
// then popUntil(tableScope, "font") would return false, but
91
// popUntil(tableScope, "i") would return true and the stack would become:
92
// ["html", "body", "font", "table", "b"]
93
//
94
// If an element's tag is in both the stop tags and matchTags, then the stack
95
// will be popped and the function returns true (provided, of course, there was
96
// no higher element in the stack that was also in the stop tags). For example,
97
// popUntil(tableScope, "table") returns true and leaves:
98
// ["html", "body", "font"]
99
func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
100
if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
101
p.oe = p.oe[:i]
102
return true
103
}
104
return false
105
}
106
107
// indexOfElementInScope returns the index in p.oe of the highest element whose
108
// tag is in matchTags that is in scope. If no matching element is in scope, it
109
// returns -1.
110
func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
111
for i := len(p.oe) - 1; i >= 0; i-- {
112
tagAtom := p.oe[i].DataAtom
113
if p.oe[i].Namespace == "" {
114
for _, t := range matchTags {
115
if t == tagAtom {
116
return i
117
}
118
}
119
switch s {
120
case defaultScope:
121
// No-op.
122
case listItemScope:
123
if tagAtom == a.Ol || tagAtom == a.Ul {
124
return -1
125
}
126
case buttonScope:
127
if tagAtom == a.Button {
128
return -1
129
}
130
case tableScope:
131
if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
132
return -1
133
}
134
case selectScope:
135
if tagAtom != a.Optgroup && tagAtom != a.Option {
136
return -1
137
}
138
default:
139
panic(fmt.Sprintf("html: internal error: indexOfElementInScope unknown scope: %d", s))
140
}
141
}
142
switch s {
143
case defaultScope, listItemScope, buttonScope:
144
for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
145
if t == tagAtom {
146
return -1
147
}
148
}
149
}
150
}
151
return -1
152
}
153
154
// elementInScope is like popUntil, except that it doesn't modify the stack of
155
// open elements.
156
func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
157
return p.indexOfElementInScope(s, matchTags...) != -1
158
}
159
160
// clearStackToContext pops elements off the stack of open elements until a
161
// scope-defined element is found.
162
func (p *parser) clearStackToContext(s scope) {
163
for i := len(p.oe) - 1; i >= 0; i-- {
164
tagAtom := p.oe[i].DataAtom
165
switch s {
166
case tableScope:
167
if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
168
p.oe = p.oe[:i+1]
169
return
170
}
171
case tableRowScope:
172
if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
173
p.oe = p.oe[:i+1]
174
return
175
}
176
case tableBodyScope:
177
if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
178
p.oe = p.oe[:i+1]
179
return
180
}
181
default:
182
panic(fmt.Sprintf("html: internal error: clearStackToContext unknown scope: %d", s))
183
}
184
}
185
}
186
187
// parseGenericRawTextElement implements the generic raw text element parsing
188
// algorithm defined in 12.2.6.2.
189
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
190
// TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
191
// officially, need to make tokenizer consider both states.
192
func (p *parser) parseGenericRawTextElement() {
193
p.addElement()
194
p.originalIM = p.im
195
p.im = textIM
196
}
197
198
// generateImpliedEndTags pops nodes off the stack of open elements as long as
199
// the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
200
// If exceptions are specified, nodes with that name will not be popped off.
201
func (p *parser) generateImpliedEndTags(exceptions ...string) {
202
var i int
203
loop:
204
for i = len(p.oe) - 1; i >= 0; i-- {
205
n := p.oe[i]
206
if n.Type != ElementNode {
207
break
208
}
209
switch n.DataAtom {
210
case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
211
for _, except := range exceptions {
212
if n.Data == except {
213
break loop
214
}
215
}
216
continue
217
}
218
break
219
}
220
221
p.oe = p.oe[:i+1]
222
}
223
224
// addChild adds a child node n to the top element, and pushes n onto the stack
225
// of open elements if it is an element node.
226
func (p *parser) addChild(n *Node) {
227
if p.shouldFosterParent() {
228
p.fosterParent(n)
229
} else {
230
p.top().AppendChild(n)
231
}
232
233
if n.Type == ElementNode {
234
p.insertOpenElement(n)
235
}
236
}
237
238
// insertOpenElement pushes n onto the stack of open elements and panics if
// the stack then holds more than 512 nodes.
func (p *parser) insertOpenElement(n *Node) {
	stack := append(p.oe, n)
	p.oe = stack
	if len(stack) > 512 {
		panic("html: open stack of elements exceeds 512 nodes")
	}
}
244
245
// shouldFosterParent returns whether the next node to be added should be
246
// foster parented.
247
func (p *parser) shouldFosterParent() bool {
248
if p.fosterParenting {
249
switch p.top().DataAtom {
250
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
251
return true
252
}
253
}
254
return false
255
}
256
257
// fosterParent adds a child node according to the foster parenting rules.
258
// Section 12.2.6.1, "foster parenting".
259
func (p *parser) fosterParent(n *Node) {
260
var table, parent, prev, template *Node
261
var i int
262
for i = len(p.oe) - 1; i >= 0; i-- {
263
if p.oe[i].DataAtom == a.Table {
264
table = p.oe[i]
265
break
266
}
267
}
268
269
var j int
270
for j = len(p.oe) - 1; j >= 0; j-- {
271
if p.oe[j].DataAtom == a.Template {
272
template = p.oe[j]
273
break
274
}
275
}
276
277
if template != nil && (table == nil || j > i) {
278
template.AppendChild(n)
279
return
280
}
281
282
if table == nil {
283
// The foster parent is the html element.
284
parent = p.oe[0]
285
} else {
286
parent = table.Parent
287
}
288
if parent == nil {
289
parent = p.oe[i-1]
290
}
291
292
if table != nil {
293
prev = table.PrevSibling
294
} else {
295
prev = parent.LastChild
296
}
297
if prev != nil && prev.Type == TextNode && n.Type == TextNode {
298
prev.Data += n.Data
299
return
300
}
301
302
parent.InsertBefore(n, table)
303
}
304
305
// addText adds text to the preceding node if it is a text node, or else it
306
// calls addChild with a new text node.
307
func (p *parser) addText(text string) {
308
if text == "" {
309
return
310
}
311
312
if p.shouldFosterParent() {
313
p.fosterParent(&Node{
314
Type: TextNode,
315
Data: text,
316
})
317
return
318
}
319
320
t := p.top()
321
if n := t.LastChild; n != nil && n.Type == TextNode {
322
n.Data += text
323
return
324
}
325
p.addChild(&Node{
326
Type: TextNode,
327
Data: text,
328
})
329
}
330
331
// addElement adds a child element based on the current token.
332
func (p *parser) addElement() {
333
p.addChild(&Node{
334
Type: ElementNode,
335
DataAtom: p.tok.DataAtom,
336
Data: p.tok.Data,
337
Attr: p.tok.Attr,
338
})
339
}
340
341
// Section 12.2.4.3.
342
func (p *parser) addFormattingElement() {
343
tagAtom, attr := p.tok.DataAtom, p.tok.Attr
344
p.addElement()
345
346
// Implement the Noah's Ark clause, but with three per family instead of two.
347
identicalElements := 0
348
findIdenticalElements:
349
for i := len(p.afe) - 1; i >= 0; i-- {
350
n := p.afe[i]
351
if n.Type == scopeMarkerNode {
352
break
353
}
354
if n.Type != ElementNode {
355
continue
356
}
357
if n.Namespace != "" {
358
continue
359
}
360
if n.DataAtom != tagAtom {
361
continue
362
}
363
if len(n.Attr) != len(attr) {
364
continue
365
}
366
compareAttributes:
367
for _, t0 := range n.Attr {
368
for _, t1 := range attr {
369
if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
370
// Found a match for this attribute, continue with the next attribute.
371
continue compareAttributes
372
}
373
}
374
// If we get here, there is no attribute that matches a.
375
// Therefore the element is not identical to the new one.
376
continue findIdenticalElements
377
}
378
379
identicalElements++
380
if identicalElements >= 3 {
381
p.afe.remove(n)
382
}
383
}
384
385
p.afe = append(p.afe, p.top())
386
}
387
388
// Section 12.2.4.3.
389
func (p *parser) clearActiveFormattingElements() {
390
for {
391
if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
392
return
393
}
394
}
395
}
396
397
// Section 12.2.4.3.
398
func (p *parser) reconstructActiveFormattingElements() {
399
n := p.afe.top()
400
if n == nil {
401
return
402
}
403
if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
404
return
405
}
406
i := len(p.afe) - 1
407
for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
408
if i == 0 {
409
i = -1
410
break
411
}
412
i--
413
n = p.afe[i]
414
}
415
for {
416
i++
417
clone := p.afe[i].clone()
418
p.addChild(clone)
419
p.afe[i] = clone
420
if i == len(p.afe)-1 {
421
break
422
}
423
}
424
}
425
426
// Section 12.2.5.
427
func (p *parser) acknowledgeSelfClosingTag() {
428
p.hasSelfClosingToken = false
429
}
430
431
// An insertion mode (section 12.2.4.1) is the state transition function from
432
// a particular state in the HTML5 parser's state machine. It updates the
433
// parser's fields depending on parser.tok (where ErrorToken means EOF).
434
// It returns whether the token was consumed.
435
type insertionMode func(*parser) bool
436
437
// setOriginalIM sets the insertion mode to return to after completing a text or
438
// inTableText insertion mode.
439
// Section 12.2.4.1, "using the rules for".
440
func (p *parser) setOriginalIM() {
441
if p.originalIM != nil {
442
panic("html: bad parser state: originalIM was set twice")
443
}
444
p.originalIM = p.im
445
}
446
447
// Section 12.2.4.1, "reset the insertion mode".
448
func (p *parser) resetInsertionMode() {
449
for i := len(p.oe) - 1; i >= 0; i-- {
450
n := p.oe[i]
451
last := i == 0
452
if last && p.context != nil {
453
n = p.context
454
}
455
456
switch n.DataAtom {
457
case a.Select:
458
if !last {
459
for ancestor, first := n, p.oe[0]; ancestor != first; {
460
ancestor = p.oe[p.oe.index(ancestor)-1]
461
switch ancestor.DataAtom {
462
case a.Template:
463
p.im = inSelectIM
464
return
465
case a.Table:
466
p.im = inSelectInTableIM
467
return
468
}
469
}
470
}
471
p.im = inSelectIM
472
case a.Td, a.Th:
473
// TODO: remove this divergence from the HTML5 spec.
474
//
475
// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
476
p.im = inCellIM
477
case a.Tr:
478
p.im = inRowIM
479
case a.Tbody, a.Thead, a.Tfoot:
480
p.im = inTableBodyIM
481
case a.Caption:
482
p.im = inCaptionIM
483
case a.Colgroup:
484
p.im = inColumnGroupIM
485
case a.Table:
486
p.im = inTableIM
487
case a.Template:
488
// TODO: remove this divergence from the HTML5 spec.
489
if n.Namespace != "" {
490
continue
491
}
492
p.im = p.templateStack.top()
493
case a.Head:
494
// TODO: remove this divergence from the HTML5 spec.
495
//
496
// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
497
p.im = inHeadIM
498
case a.Body:
499
p.im = inBodyIM
500
case a.Frameset:
501
p.im = inFramesetIM
502
case a.Html:
503
if p.head == nil {
504
p.im = beforeHeadIM
505
} else {
506
p.im = afterHeadIM
507
}
508
default:
509
if last {
510
p.im = inBodyIM
511
return
512
}
513
continue
514
}
515
return
516
}
517
}
518
519
// whitespace is the cutset passed to strings.TrimLeft by the insertion modes
// to strip leading whitespace from text tokens: space, tab, carriage return,
// line feed and form feed.
const whitespace = " \t\r\n\f"
520
521
// Section 12.2.6.4.1.
522
func initialIM(p *parser) bool {
523
switch p.tok.Type {
524
case TextToken:
525
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
526
if len(p.tok.Data) == 0 {
527
// It was all whitespace, so ignore it.
528
return true
529
}
530
case CommentToken:
531
p.doc.AppendChild(&Node{
532
Type: CommentNode,
533
Data: p.tok.Data,
534
})
535
return true
536
case DoctypeToken:
537
n, quirks := parseDoctype(p.tok.Data)
538
p.doc.AppendChild(n)
539
p.quirks = quirks
540
p.im = beforeHTMLIM
541
return true
542
}
543
p.quirks = true
544
p.im = beforeHTMLIM
545
return false
546
}
547
548
// Section 12.2.6.4.2.
549
func beforeHTMLIM(p *parser) bool {
550
switch p.tok.Type {
551
case DoctypeToken:
552
// Ignore the token.
553
return true
554
case TextToken:
555
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
556
if len(p.tok.Data) == 0 {
557
// It was all whitespace, so ignore it.
558
return true
559
}
560
case StartTagToken:
561
if p.tok.DataAtom == a.Html {
562
p.addElement()
563
p.im = beforeHeadIM
564
return true
565
}
566
case EndTagToken:
567
switch p.tok.DataAtom {
568
case a.Head, a.Body, a.Html, a.Br:
569
p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
570
return false
571
default:
572
// Ignore the token.
573
return true
574
}
575
case CommentToken:
576
p.doc.AppendChild(&Node{
577
Type: CommentNode,
578
Data: p.tok.Data,
579
})
580
return true
581
}
582
p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
583
return false
584
}
585
586
// Section 12.2.6.4.3.
587
func beforeHeadIM(p *parser) bool {
588
switch p.tok.Type {
589
case TextToken:
590
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
591
if len(p.tok.Data) == 0 {
592
// It was all whitespace, so ignore it.
593
return true
594
}
595
case StartTagToken:
596
switch p.tok.DataAtom {
597
case a.Head:
598
p.addElement()
599
p.head = p.top()
600
p.im = inHeadIM
601
return true
602
case a.Html:
603
return inBodyIM(p)
604
}
605
case EndTagToken:
606
switch p.tok.DataAtom {
607
case a.Head, a.Body, a.Html, a.Br:
608
p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
609
return false
610
default:
611
// Ignore the token.
612
return true
613
}
614
case CommentToken:
615
p.addChild(&Node{
616
Type: CommentNode,
617
Data: p.tok.Data,
618
})
619
return true
620
case DoctypeToken:
621
// Ignore the token.
622
return true
623
}
624
625
p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
626
return false
627
}
628
629
// Section 12.2.6.4.4.
630
func inHeadIM(p *parser) bool {
631
switch p.tok.Type {
632
case TextToken:
633
s := strings.TrimLeft(p.tok.Data, whitespace)
634
if len(s) < len(p.tok.Data) {
635
// Add the initial whitespace to the current node.
636
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
637
if s == "" {
638
return true
639
}
640
p.tok.Data = s
641
}
642
case StartTagToken:
643
switch p.tok.DataAtom {
644
case a.Html:
645
return inBodyIM(p)
646
case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
647
p.addElement()
648
p.oe.pop()
649
p.acknowledgeSelfClosingTag()
650
return true
651
case a.Noscript:
652
if p.scripting {
653
p.parseGenericRawTextElement()
654
return true
655
}
656
p.addElement()
657
p.im = inHeadNoscriptIM
658
// Don't let the tokenizer go into raw text mode when scripting is disabled.
659
p.tokenizer.NextIsNotRawText()
660
return true
661
case a.Script, a.Title:
662
p.addElement()
663
p.setOriginalIM()
664
p.im = textIM
665
return true
666
case a.Noframes, a.Style:
667
p.parseGenericRawTextElement()
668
return true
669
case a.Head:
670
// Ignore the token.
671
return true
672
case a.Template:
673
// TODO: remove this divergence from the HTML5 spec.
674
//
675
// We don't handle all of the corner cases when mixing foreign
676
// content (i.e. <math> or <svg>) with <template>. Without this
677
// early return, we can get into an infinite loop, possibly because
678
// of the "TODO... further divergence" a little below.
679
//
680
// As a workaround, if we are mixing foreign content and templates,
681
// just ignore the rest of the HTML. Foreign content is rare and a
682
// relatively old HTML feature. Templates are also rare and a
683
// relatively new HTML feature. Their combination is very rare.
684
for _, e := range p.oe {
685
if e.Namespace != "" {
686
p.im = ignoreTheRemainingTokens
687
return true
688
}
689
}
690
691
p.addElement()
692
p.afe = append(p.afe, &scopeMarker)
693
p.framesetOK = false
694
p.im = inTemplateIM
695
p.templateStack = append(p.templateStack, inTemplateIM)
696
return true
697
}
698
case EndTagToken:
699
switch p.tok.DataAtom {
700
case a.Head:
701
p.oe.pop()
702
p.im = afterHeadIM
703
return true
704
case a.Body, a.Html, a.Br:
705
p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
706
return false
707
case a.Template:
708
if !p.oe.contains(a.Template) {
709
return true
710
}
711
// TODO: remove this further divergence from the HTML5 spec.
712
//
713
// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
714
p.generateImpliedEndTags()
715
for i := len(p.oe) - 1; i >= 0; i-- {
716
if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
717
p.oe = p.oe[:i]
718
break
719
}
720
}
721
p.clearActiveFormattingElements()
722
p.templateStack.pop()
723
p.resetInsertionMode()
724
return true
725
default:
726
// Ignore the token.
727
return true
728
}
729
case CommentToken:
730
p.addChild(&Node{
731
Type: CommentNode,
732
Data: p.tok.Data,
733
})
734
return true
735
case DoctypeToken:
736
// Ignore the token.
737
return true
738
}
739
740
p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
741
return false
742
}
743
744
// Section 12.2.6.4.5.
745
func inHeadNoscriptIM(p *parser) bool {
746
switch p.tok.Type {
747
case DoctypeToken:
748
// Ignore the token.
749
return true
750
case StartTagToken:
751
switch p.tok.DataAtom {
752
case a.Html:
753
return inBodyIM(p)
754
case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
755
return inHeadIM(p)
756
case a.Head:
757
// Ignore the token.
758
return true
759
case a.Noscript:
760
// Don't let the tokenizer go into raw text mode even when a <noscript>
761
// tag is in "in head noscript" insertion mode.
762
p.tokenizer.NextIsNotRawText()
763
// Ignore the token.
764
return true
765
}
766
case EndTagToken:
767
switch p.tok.DataAtom {
768
case a.Noscript, a.Br:
769
default:
770
// Ignore the token.
771
return true
772
}
773
case TextToken:
774
s := strings.TrimLeft(p.tok.Data, whitespace)
775
if len(s) == 0 {
776
// It was all whitespace.
777
return inHeadIM(p)
778
}
779
case CommentToken:
780
return inHeadIM(p)
781
}
782
p.oe.pop()
783
if p.top().DataAtom != a.Head {
784
panic("html: the new current node will be a head element.")
785
}
786
p.im = inHeadIM
787
if p.tok.DataAtom == a.Noscript {
788
return true
789
}
790
return false
791
}
792
793
// Section 12.2.6.4.6.
794
func afterHeadIM(p *parser) bool {
795
switch p.tok.Type {
796
case TextToken:
797
s := strings.TrimLeft(p.tok.Data, whitespace)
798
if len(s) < len(p.tok.Data) {
799
// Add the initial whitespace to the current node.
800
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
801
if s == "" {
802
return true
803
}
804
p.tok.Data = s
805
}
806
case StartTagToken:
807
switch p.tok.DataAtom {
808
case a.Html:
809
return inBodyIM(p)
810
case a.Body:
811
p.addElement()
812
p.framesetOK = false
813
p.im = inBodyIM
814
return true
815
case a.Frameset:
816
p.addElement()
817
p.im = inFramesetIM
818
return true
819
case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
820
p.insertOpenElement(p.head)
821
defer p.oe.remove(p.head)
822
return inHeadIM(p)
823
case a.Head:
824
// Ignore the token.
825
return true
826
}
827
case EndTagToken:
828
switch p.tok.DataAtom {
829
case a.Body, a.Html, a.Br:
830
// Drop down to creating an implied <body> tag.
831
case a.Template:
832
return inHeadIM(p)
833
default:
834
// Ignore the token.
835
return true
836
}
837
case CommentToken:
838
p.addChild(&Node{
839
Type: CommentNode,
840
Data: p.tok.Data,
841
})
842
return true
843
case DoctypeToken:
844
// Ignore the token.
845
return true
846
}
847
848
p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
849
p.framesetOK = true
850
if p.tok.Type == ErrorToken {
851
// Stop parsing.
852
return true
853
}
854
return false
855
}
856
857
// copyAttributes copies attributes of src not found on dst to dst.
858
func copyAttributes(dst *Node, src Token) {
859
if len(src.Attr) == 0 {
860
return
861
}
862
attr := map[string]string{}
863
for _, t := range dst.Attr {
864
attr[t.Key] = t.Val
865
}
866
for _, t := range src.Attr {
867
if _, ok := attr[t.Key]; !ok {
868
dst.Attr = append(dst.Attr, t)
869
attr[t.Key] = t.Val
870
}
871
}
872
}
873
874
// Section 12.2.6.4.7.
875
func inBodyIM(p *parser) bool {
876
switch p.tok.Type {
877
case TextToken:
878
d := p.tok.Data
879
switch n := p.oe.top(); n.DataAtom {
880
case a.Pre, a.Listing:
881
if n.FirstChild == nil {
882
// Ignore a newline at the start of a <pre> block.
883
if d != "" && d[0] == '\r' {
884
d = d[1:]
885
}
886
if d != "" && d[0] == '\n' {
887
d = d[1:]
888
}
889
}
890
}
891
d = strings.Replace(d, "\x00", "", -1)
892
if d == "" {
893
return true
894
}
895
p.reconstructActiveFormattingElements()
896
p.addText(d)
897
if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
898
// There were non-whitespace characters inserted.
899
p.framesetOK = false
900
}
901
case StartTagToken:
902
switch p.tok.DataAtom {
903
case a.Html:
904
if p.oe.contains(a.Template) {
905
return true
906
}
907
copyAttributes(p.oe[0], p.tok)
908
case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
909
return inHeadIM(p)
910
case a.Body:
911
if p.oe.contains(a.Template) {
912
return true
913
}
914
if len(p.oe) >= 2 {
915
body := p.oe[1]
916
if body.Type == ElementNode && body.DataAtom == a.Body {
917
p.framesetOK = false
918
copyAttributes(body, p.tok)
919
}
920
}
921
case a.Frameset:
922
if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
923
// Ignore the token.
924
return true
925
}
926
body := p.oe[1]
927
if body.Parent != nil {
928
body.Parent.RemoveChild(body)
929
}
930
p.oe = p.oe[:1]
931
p.addElement()
932
p.im = inFramesetIM
933
return true
934
case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
935
p.popUntil(buttonScope, a.P)
936
p.addElement()
937
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
938
p.popUntil(buttonScope, a.P)
939
switch n := p.top(); n.DataAtom {
940
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
941
p.oe.pop()
942
}
943
p.addElement()
944
case a.Pre, a.Listing:
945
p.popUntil(buttonScope, a.P)
946
p.addElement()
947
// The newline, if any, will be dealt with by the TextToken case.
948
p.framesetOK = false
949
case a.Form:
950
if p.form != nil && !p.oe.contains(a.Template) {
951
// Ignore the token
952
return true
953
}
954
p.popUntil(buttonScope, a.P)
955
p.addElement()
956
if !p.oe.contains(a.Template) {
957
p.form = p.top()
958
}
959
case a.Li:
960
p.framesetOK = false
961
for i := len(p.oe) - 1; i >= 0; i-- {
962
node := p.oe[i]
963
switch node.DataAtom {
964
case a.Li:
965
p.oe = p.oe[:i]
966
case a.Address, a.Div, a.P:
967
continue
968
default:
969
if !isSpecialElement(node) {
970
continue
971
}
972
}
973
break
974
}
975
p.popUntil(buttonScope, a.P)
976
p.addElement()
977
case a.Dd, a.Dt:
978
p.framesetOK = false
979
for i := len(p.oe) - 1; i >= 0; i-- {
980
node := p.oe[i]
981
switch node.DataAtom {
982
case a.Dd, a.Dt:
983
p.oe = p.oe[:i]
984
case a.Address, a.Div, a.P:
985
continue
986
default:
987
if !isSpecialElement(node) {
988
continue
989
}
990
}
991
break
992
}
993
p.popUntil(buttonScope, a.P)
994
p.addElement()
995
case a.Plaintext:
996
p.popUntil(buttonScope, a.P)
997
p.addElement()
998
case a.Button:
999
p.popUntil(defaultScope, a.Button)
1000
p.reconstructActiveFormattingElements()
1001
p.addElement()
1002
p.framesetOK = false
1003
case a.A:
1004
for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
1005
if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
1006
p.inBodyEndTagFormatting(a.A, "a")
1007
p.oe.remove(n)
1008
p.afe.remove(n)
1009
break
1010
}
1011
}
1012
p.reconstructActiveFormattingElements()
1013
p.addFormattingElement()
1014
case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1015
p.reconstructActiveFormattingElements()
1016
p.addFormattingElement()
1017
case a.Nobr:
1018
p.reconstructActiveFormattingElements()
1019
if p.elementInScope(defaultScope, a.Nobr) {
1020
p.inBodyEndTagFormatting(a.Nobr, "nobr")
1021
p.reconstructActiveFormattingElements()
1022
}
1023
p.addFormattingElement()
1024
case a.Applet, a.Marquee, a.Object:
1025
p.reconstructActiveFormattingElements()
1026
p.addElement()
1027
p.afe = append(p.afe, &scopeMarker)
1028
p.framesetOK = false
1029
case a.Table:
1030
if !p.quirks {
1031
p.popUntil(buttonScope, a.P)
1032
}
1033
p.addElement()
1034
p.framesetOK = false
1035
p.im = inTableIM
1036
return true
1037
case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
1038
p.reconstructActiveFormattingElements()
1039
p.addElement()
1040
p.oe.pop()
1041
p.acknowledgeSelfClosingTag()
1042
if p.tok.DataAtom == a.Input {
1043
for _, t := range p.tok.Attr {
1044
if t.Key == "type" {
1045
if strings.EqualFold(t.Val, "hidden") {
1046
// Skip setting framesetOK = false
1047
return true
1048
}
1049
}
1050
}
1051
}
1052
p.framesetOK = false
1053
case a.Param, a.Source, a.Track:
1054
p.addElement()
1055
p.oe.pop()
1056
p.acknowledgeSelfClosingTag()
1057
case a.Hr:
1058
p.popUntil(buttonScope, a.P)
1059
p.addElement()
1060
p.oe.pop()
1061
p.acknowledgeSelfClosingTag()
1062
p.framesetOK = false
1063
case a.Image:
1064
p.tok.DataAtom = a.Img
1065
p.tok.Data = a.Img.String()
1066
return false
1067
case a.Textarea:
1068
p.addElement()
1069
p.setOriginalIM()
1070
p.framesetOK = false
1071
p.im = textIM
1072
case a.Xmp:
1073
p.popUntil(buttonScope, a.P)
1074
p.reconstructActiveFormattingElements()
1075
p.framesetOK = false
1076
p.parseGenericRawTextElement()
1077
case a.Iframe:
1078
p.framesetOK = false
1079
p.parseGenericRawTextElement()
1080
case a.Noembed:
1081
p.parseGenericRawTextElement()
1082
case a.Noscript:
1083
if p.scripting {
1084
p.parseGenericRawTextElement()
1085
return true
1086
}
1087
p.reconstructActiveFormattingElements()
1088
p.addElement()
1089
// Don't let the tokenizer go into raw text mode when scripting is disabled.
1090
p.tokenizer.NextIsNotRawText()
1091
case a.Select:
1092
p.reconstructActiveFormattingElements()
1093
p.addElement()
1094
p.framesetOK = false
1095
p.im = inSelectIM
1096
return true
1097
case a.Optgroup, a.Option:
1098
if p.top().DataAtom == a.Option {
1099
p.oe.pop()
1100
}
1101
p.reconstructActiveFormattingElements()
1102
p.addElement()
1103
case a.Rb, a.Rtc:
1104
if p.elementInScope(defaultScope, a.Ruby) {
1105
p.generateImpliedEndTags()
1106
}
1107
p.addElement()
1108
case a.Rp, a.Rt:
1109
if p.elementInScope(defaultScope, a.Ruby) {
1110
p.generateImpliedEndTags("rtc")
1111
}
1112
p.addElement()
1113
case a.Math, a.Svg:
1114
p.reconstructActiveFormattingElements()
1115
if p.tok.DataAtom == a.Math {
1116
adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
1117
} else {
1118
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
1119
}
1120
adjustForeignAttributes(p.tok.Attr)
1121
p.addElement()
1122
p.top().Namespace = p.tok.Data
1123
if p.hasSelfClosingToken {
1124
p.oe.pop()
1125
p.acknowledgeSelfClosingTag()
1126
}
1127
return true
1128
case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1129
// Ignore the token.
1130
default:
1131
p.reconstructActiveFormattingElements()
1132
p.addElement()
1133
}
1134
case EndTagToken:
1135
switch p.tok.DataAtom {
1136
case a.Body:
1137
if p.elementInScope(defaultScope, a.Body) {
1138
p.im = afterBodyIM
1139
}
1140
case a.Html:
1141
if p.elementInScope(defaultScope, a.Body) {
1142
p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
1143
return false
1144
}
1145
return true
1146
case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
1147
p.popUntil(defaultScope, p.tok.DataAtom)
1148
case a.Form:
1149
if p.oe.contains(a.Template) {
1150
i := p.indexOfElementInScope(defaultScope, a.Form)
1151
if i == -1 {
1152
// Ignore the token.
1153
return true
1154
}
1155
p.generateImpliedEndTags()
1156
if p.oe[i].DataAtom != a.Form {
1157
// Ignore the token.
1158
return true
1159
}
1160
p.popUntil(defaultScope, a.Form)
1161
} else {
1162
node := p.form
1163
p.form = nil
1164
i := p.indexOfElementInScope(defaultScope, a.Form)
1165
if node == nil || i == -1 || p.oe[i] != node {
1166
// Ignore the token.
1167
return true
1168
}
1169
p.generateImpliedEndTags()
1170
p.oe.remove(node)
1171
}
1172
case a.P:
1173
if !p.elementInScope(buttonScope, a.P) {
1174
p.parseImpliedToken(StartTagToken, a.P, a.P.String())
1175
}
1176
p.popUntil(buttonScope, a.P)
1177
case a.Li:
1178
p.popUntil(listItemScope, a.Li)
1179
case a.Dd, a.Dt:
1180
p.popUntil(defaultScope, p.tok.DataAtom)
1181
case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
1182
p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
1183
case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
1184
p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
1185
case a.Applet, a.Marquee, a.Object:
1186
if p.popUntil(defaultScope, p.tok.DataAtom) {
1187
p.clearActiveFormattingElements()
1188
}
1189
case a.Br:
1190
p.tok.Type = StartTagToken
1191
return false
1192
case a.Template:
1193
return inHeadIM(p)
1194
default:
1195
p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
1196
}
1197
case CommentToken:
1198
p.addChild(&Node{
1199
Type: CommentNode,
1200
Data: p.tok.Data,
1201
})
1202
case ErrorToken:
1203
// TODO: remove this divergence from the HTML5 spec.
1204
if len(p.templateStack) > 0 {
1205
p.im = inTemplateIM
1206
return false
1207
}
1208
for _, e := range p.oe {
1209
switch e.DataAtom {
1210
case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
1211
a.Thead, a.Tr, a.Body, a.Html:
1212
default:
1213
return true
1214
}
1215
}
1216
}
1217
1218
return true
1219
}
1220
1221
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
1222
// This is the "adoption agency" algorithm, described at
1223
// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
1224
1225
// TODO: this is a fairly literal line-by-line translation of that algorithm.
1226
// Once the code successfully parses the comprehensive test suite, we should
1227
// refactor this code to be more idiomatic.
1228
1229
// Steps 1-2
1230
if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
1231
p.oe.pop()
1232
return
1233
}
1234
1235
// Steps 3-5. The outer loop.
1236
for i := 0; i < 8; i++ {
1237
// Step 6. Find the formatting element.
1238
var formattingElement *Node
1239
for j := len(p.afe) - 1; j >= 0; j-- {
1240
if p.afe[j].Type == scopeMarkerNode {
1241
break
1242
}
1243
if p.afe[j].DataAtom == tagAtom {
1244
formattingElement = p.afe[j]
1245
break
1246
}
1247
}
1248
if formattingElement == nil {
1249
p.inBodyEndTagOther(tagAtom, tagName)
1250
return
1251
}
1252
1253
// Step 7. Ignore the tag if formatting element is not in the stack of open elements.
1254
feIndex := p.oe.index(formattingElement)
1255
if feIndex == -1 {
1256
p.afe.remove(formattingElement)
1257
return
1258
}
1259
// Step 8. Ignore the tag if formatting element is not in the scope.
1260
if !p.elementInScope(defaultScope, tagAtom) {
1261
// Ignore the tag.
1262
return
1263
}
1264
1265
// Step 9. This step is omitted because it's just a parse error but no need to return.
1266
1267
// Steps 10-11. Find the furthest block.
1268
var furthestBlock *Node
1269
for _, e := range p.oe[feIndex:] {
1270
if isSpecialElement(e) {
1271
furthestBlock = e
1272
break
1273
}
1274
}
1275
if furthestBlock == nil {
1276
e := p.oe.pop()
1277
for e != formattingElement {
1278
e = p.oe.pop()
1279
}
1280
p.afe.remove(e)
1281
return
1282
}
1283
1284
// Steps 12-13. Find the common ancestor and bookmark node.
1285
commonAncestor := p.oe[feIndex-1]
1286
bookmark := p.afe.index(formattingElement)
1287
1288
// Step 14. The inner loop. Find the lastNode to reparent.
1289
lastNode := furthestBlock
1290
node := furthestBlock
1291
x := p.oe.index(node)
1292
// Step 14.1.
1293
j := 0
1294
for {
1295
// Step 14.2.
1296
j++
1297
// Step. 14.3.
1298
x--
1299
node = p.oe[x]
1300
// Step 14.4. Go to the next step if node is formatting element.
1301
if node == formattingElement {
1302
break
1303
}
1304
// Step 14.5. Remove node from the list of active formatting elements if
1305
// inner loop counter is greater than three and node is in the list of
1306
// active formatting elements.
1307
if ni := p.afe.index(node); j > 3 && ni > -1 {
1308
p.afe.remove(node)
1309
// If any element of the list of active formatting elements is removed,
1310
// we need to take care whether bookmark should be decremented or not.
1311
// This is because the value of bookmark may exceed the size of the
1312
// list by removing elements from the list.
1313
if ni <= bookmark {
1314
bookmark--
1315
}
1316
continue
1317
}
1318
// Step 14.6. Continue the next inner loop if node is not in the list of
1319
// active formatting elements.
1320
if p.afe.index(node) == -1 {
1321
p.oe.remove(node)
1322
continue
1323
}
1324
// Step 14.7.
1325
clone := node.clone()
1326
p.afe[p.afe.index(node)] = clone
1327
p.oe[p.oe.index(node)] = clone
1328
node = clone
1329
// Step 14.8.
1330
if lastNode == furthestBlock {
1331
bookmark = p.afe.index(node) + 1
1332
}
1333
// Step 14.9.
1334
if lastNode.Parent != nil {
1335
lastNode.Parent.RemoveChild(lastNode)
1336
}
1337
node.AppendChild(lastNode)
1338
// Step 14.10.
1339
lastNode = node
1340
}
1341
1342
// Step 15. Reparent lastNode to the common ancestor,
1343
// or for misnested table nodes, to the foster parent.
1344
if lastNode.Parent != nil {
1345
lastNode.Parent.RemoveChild(lastNode)
1346
}
1347
switch commonAncestor.DataAtom {
1348
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1349
p.fosterParent(lastNode)
1350
default:
1351
commonAncestor.AppendChild(lastNode)
1352
}
1353
1354
// Steps 16-18. Reparent nodes from the furthest block's children
1355
// to a clone of the formatting element.
1356
clone := formattingElement.clone()
1357
reparentChildren(clone, furthestBlock)
1358
furthestBlock.AppendChild(clone)
1359
1360
// Step 19. Fix up the list of active formatting elements.
1361
if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
1362
// Move the bookmark with the rest of the list.
1363
bookmark--
1364
}
1365
p.afe.remove(formattingElement)
1366
p.afe.insert(bookmark, clone)
1367
1368
// Step 20. Fix up the stack of open elements.
1369
p.oe.remove(formattingElement)
1370
p.oe.insert(p.oe.index(furthestBlock)+1, clone)
1371
}
1372
}
1373
1374
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
1375
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
1376
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
1377
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
1378
for i := len(p.oe) - 1; i >= 0; i-- {
1379
// Two element nodes have the same tag if they have the same Data (a
1380
// string-typed field). As an optimization, for common HTML tags, each
1381
// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
1382
// field), since integer comparison is faster than string comparison.
1383
// Uncommon (custom) tags get a zero DataAtom.
1384
//
1385
// The if condition here is equivalent to (p.oe[i].Data == tagName).
1386
if (p.oe[i].DataAtom == tagAtom) &&
1387
((tagAtom != 0) || (p.oe[i].Data == tagName)) {
1388
p.oe = p.oe[:i]
1389
break
1390
}
1391
if isSpecialElement(p.oe[i]) {
1392
break
1393
}
1394
}
1395
}
1396
1397
// Section 12.2.6.4.8.
1398
func textIM(p *parser) bool {
1399
switch p.tok.Type {
1400
case ErrorToken:
1401
p.oe.pop()
1402
case TextToken:
1403
d := p.tok.Data
1404
if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
1405
// Ignore a newline at the start of a <textarea> block.
1406
if d != "" && d[0] == '\r' {
1407
d = d[1:]
1408
}
1409
if d != "" && d[0] == '\n' {
1410
d = d[1:]
1411
}
1412
}
1413
if d == "" {
1414
return true
1415
}
1416
p.addText(d)
1417
return true
1418
case EndTagToken:
1419
p.oe.pop()
1420
}
1421
p.im = p.originalIM
1422
p.originalIM = nil
1423
return p.tok.Type == EndTagToken
1424
}
1425
1426
// Section 12.2.6.4.9.
1427
func inTableIM(p *parser) bool {
1428
switch p.tok.Type {
1429
case TextToken:
1430
p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
1431
switch p.oe.top().DataAtom {
1432
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1433
if strings.Trim(p.tok.Data, whitespace) == "" {
1434
p.addText(p.tok.Data)
1435
return true
1436
}
1437
}
1438
case StartTagToken:
1439
switch p.tok.DataAtom {
1440
case a.Caption:
1441
p.clearStackToContext(tableScope)
1442
p.afe = append(p.afe, &scopeMarker)
1443
p.addElement()
1444
p.im = inCaptionIM
1445
return true
1446
case a.Colgroup:
1447
p.clearStackToContext(tableScope)
1448
p.addElement()
1449
p.im = inColumnGroupIM
1450
return true
1451
case a.Col:
1452
p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
1453
return false
1454
case a.Tbody, a.Tfoot, a.Thead:
1455
p.clearStackToContext(tableScope)
1456
p.addElement()
1457
p.im = inTableBodyIM
1458
return true
1459
case a.Td, a.Th, a.Tr:
1460
p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
1461
return false
1462
case a.Table:
1463
if p.popUntil(tableScope, a.Table) {
1464
p.resetInsertionMode()
1465
return false
1466
}
1467
// Ignore the token.
1468
return true
1469
case a.Style, a.Script, a.Template:
1470
return inHeadIM(p)
1471
case a.Input:
1472
for _, t := range p.tok.Attr {
1473
if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {
1474
p.addElement()
1475
p.oe.pop()
1476
return true
1477
}
1478
}
1479
// Otherwise drop down to the default action.
1480
case a.Form:
1481
if p.oe.contains(a.Template) || p.form != nil {
1482
// Ignore the token.
1483
return true
1484
}
1485
p.addElement()
1486
p.form = p.oe.pop()
1487
case a.Select:
1488
p.reconstructActiveFormattingElements()
1489
switch p.top().DataAtom {
1490
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1491
p.fosterParenting = true
1492
}
1493
p.addElement()
1494
p.fosterParenting = false
1495
p.framesetOK = false
1496
p.im = inSelectInTableIM
1497
return true
1498
}
1499
case EndTagToken:
1500
switch p.tok.DataAtom {
1501
case a.Table:
1502
if p.popUntil(tableScope, a.Table) {
1503
p.resetInsertionMode()
1504
return true
1505
}
1506
// Ignore the token.
1507
return true
1508
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1509
// Ignore the token.
1510
return true
1511
case a.Template:
1512
return inHeadIM(p)
1513
}
1514
case CommentToken:
1515
p.addChild(&Node{
1516
Type: CommentNode,
1517
Data: p.tok.Data,
1518
})
1519
return true
1520
case DoctypeToken:
1521
// Ignore the token.
1522
return true
1523
case ErrorToken:
1524
return inBodyIM(p)
1525
}
1526
1527
p.fosterParenting = true
1528
defer func() { p.fosterParenting = false }()
1529
1530
return inBodyIM(p)
1531
}
1532
1533
// Section 12.2.6.4.11.
1534
func inCaptionIM(p *parser) bool {
1535
switch p.tok.Type {
1536
case StartTagToken:
1537
switch p.tok.DataAtom {
1538
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
1539
if !p.popUntil(tableScope, a.Caption) {
1540
// Ignore the token.
1541
return true
1542
}
1543
p.clearActiveFormattingElements()
1544
p.im = inTableIM
1545
return false
1546
case a.Select:
1547
p.reconstructActiveFormattingElements()
1548
p.addElement()
1549
p.framesetOK = false
1550
p.im = inSelectInTableIM
1551
return true
1552
}
1553
case EndTagToken:
1554
switch p.tok.DataAtom {
1555
case a.Caption:
1556
if p.popUntil(tableScope, a.Caption) {
1557
p.clearActiveFormattingElements()
1558
p.im = inTableIM
1559
}
1560
return true
1561
case a.Table:
1562
if !p.popUntil(tableScope, a.Caption) {
1563
// Ignore the token.
1564
return true
1565
}
1566
p.clearActiveFormattingElements()
1567
p.im = inTableIM
1568
return false
1569
case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1570
// Ignore the token.
1571
return true
1572
}
1573
}
1574
return inBodyIM(p)
1575
}
1576
1577
// Section 12.2.6.4.12.
1578
func inColumnGroupIM(p *parser) bool {
1579
switch p.tok.Type {
1580
case TextToken:
1581
s := strings.TrimLeft(p.tok.Data, whitespace)
1582
if len(s) < len(p.tok.Data) {
1583
// Add the initial whitespace to the current node.
1584
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
1585
if s == "" {
1586
return true
1587
}
1588
p.tok.Data = s
1589
}
1590
case CommentToken:
1591
p.addChild(&Node{
1592
Type: CommentNode,
1593
Data: p.tok.Data,
1594
})
1595
return true
1596
case DoctypeToken:
1597
// Ignore the token.
1598
return true
1599
case StartTagToken:
1600
switch p.tok.DataAtom {
1601
case a.Html:
1602
return inBodyIM(p)
1603
case a.Col:
1604
p.addElement()
1605
p.oe.pop()
1606
p.acknowledgeSelfClosingTag()
1607
return true
1608
case a.Template:
1609
return inHeadIM(p)
1610
}
1611
case EndTagToken:
1612
switch p.tok.DataAtom {
1613
case a.Colgroup:
1614
if p.oe.top().DataAtom == a.Colgroup {
1615
p.oe.pop()
1616
p.im = inTableIM
1617
}
1618
return true
1619
case a.Col:
1620
// Ignore the token.
1621
return true
1622
case a.Template:
1623
return inHeadIM(p)
1624
}
1625
case ErrorToken:
1626
return inBodyIM(p)
1627
}
1628
if p.oe.top().DataAtom != a.Colgroup {
1629
return true
1630
}
1631
p.oe.pop()
1632
p.im = inTableIM
1633
return false
1634
}
1635
1636
// Section 12.2.6.4.13.
1637
func inTableBodyIM(p *parser) bool {
1638
switch p.tok.Type {
1639
case StartTagToken:
1640
switch p.tok.DataAtom {
1641
case a.Tr:
1642
p.clearStackToContext(tableBodyScope)
1643
p.addElement()
1644
p.im = inRowIM
1645
return true
1646
case a.Td, a.Th:
1647
p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
1648
return false
1649
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1650
if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1651
p.im = inTableIM
1652
return false
1653
}
1654
// Ignore the token.
1655
return true
1656
}
1657
case EndTagToken:
1658
switch p.tok.DataAtom {
1659
case a.Tbody, a.Tfoot, a.Thead:
1660
if p.elementInScope(tableScope, p.tok.DataAtom) {
1661
p.clearStackToContext(tableBodyScope)
1662
p.oe.pop()
1663
p.im = inTableIM
1664
}
1665
return true
1666
case a.Table:
1667
if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
1668
p.im = inTableIM
1669
return false
1670
}
1671
// Ignore the token.
1672
return true
1673
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
1674
// Ignore the token.
1675
return true
1676
}
1677
case CommentToken:
1678
p.addChild(&Node{
1679
Type: CommentNode,
1680
Data: p.tok.Data,
1681
})
1682
return true
1683
}
1684
1685
return inTableIM(p)
1686
}
1687
1688
// Section 13.2.6.4.14.
1689
func inRowIM(p *parser) bool {
1690
switch p.tok.Type {
1691
case StartTagToken:
1692
switch p.tok.DataAtom {
1693
case a.Td, a.Th:
1694
p.clearStackToContext(tableRowScope)
1695
p.addElement()
1696
p.afe = append(p.afe, &scopeMarker)
1697
p.im = inCellIM
1698
return true
1699
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1700
if p.elementInScope(tableScope, a.Tr) {
1701
p.clearStackToContext(tableRowScope)
1702
p.oe.pop()
1703
p.im = inTableBodyIM
1704
return false
1705
}
1706
// Ignore the token.
1707
return true
1708
}
1709
case EndTagToken:
1710
switch p.tok.DataAtom {
1711
case a.Tr:
1712
if p.elementInScope(tableScope, a.Tr) {
1713
p.clearStackToContext(tableRowScope)
1714
p.oe.pop()
1715
p.im = inTableBodyIM
1716
return true
1717
}
1718
// Ignore the token.
1719
return true
1720
case a.Table:
1721
if p.elementInScope(tableScope, a.Tr) {
1722
p.clearStackToContext(tableRowScope)
1723
p.oe.pop()
1724
p.im = inTableBodyIM
1725
return false
1726
}
1727
// Ignore the token.
1728
return true
1729
case a.Tbody, a.Tfoot, a.Thead:
1730
if p.elementInScope(tableScope, p.tok.DataAtom) && p.elementInScope(tableScope, a.Tr) {
1731
p.clearStackToContext(tableRowScope)
1732
p.oe.pop()
1733
p.im = inTableBodyIM
1734
return false
1735
}
1736
// Ignore the token.
1737
return true
1738
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
1739
// Ignore the token.
1740
return true
1741
}
1742
}
1743
1744
return inTableIM(p)
1745
}
1746
1747
// Section 12.2.6.4.15.
1748
func inCellIM(p *parser) bool {
1749
switch p.tok.Type {
1750
case StartTagToken:
1751
switch p.tok.DataAtom {
1752
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
1753
if p.popUntil(tableScope, a.Td, a.Th) {
1754
// Close the cell and reprocess.
1755
p.clearActiveFormattingElements()
1756
p.im = inRowIM
1757
return false
1758
}
1759
// Ignore the token.
1760
return true
1761
case a.Select:
1762
p.reconstructActiveFormattingElements()
1763
p.addElement()
1764
p.framesetOK = false
1765
p.im = inSelectInTableIM
1766
return true
1767
}
1768
case EndTagToken:
1769
switch p.tok.DataAtom {
1770
case a.Td, a.Th:
1771
if !p.popUntil(tableScope, p.tok.DataAtom) {
1772
// Ignore the token.
1773
return true
1774
}
1775
p.clearActiveFormattingElements()
1776
p.im = inRowIM
1777
return true
1778
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
1779
// Ignore the token.
1780
return true
1781
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
1782
if !p.elementInScope(tableScope, p.tok.DataAtom) {
1783
// Ignore the token.
1784
return true
1785
}
1786
// Close the cell and reprocess.
1787
if p.popUntil(tableScope, a.Td, a.Th) {
1788
p.clearActiveFormattingElements()
1789
}
1790
p.im = inRowIM
1791
return false
1792
}
1793
}
1794
return inBodyIM(p)
1795
}
1796
1797
// Section 12.2.6.4.16.
1798
func inSelectIM(p *parser) bool {
1799
switch p.tok.Type {
1800
case TextToken:
1801
p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
1802
case StartTagToken:
1803
switch p.tok.DataAtom {
1804
case a.Html:
1805
return inBodyIM(p)
1806
case a.Option:
1807
if p.top().DataAtom == a.Option {
1808
p.oe.pop()
1809
}
1810
p.addElement()
1811
case a.Optgroup:
1812
if p.top().DataAtom == a.Option {
1813
p.oe.pop()
1814
}
1815
if p.top().DataAtom == a.Optgroup {
1816
p.oe.pop()
1817
}
1818
p.addElement()
1819
case a.Select:
1820
if !p.popUntil(selectScope, a.Select) {
1821
// Ignore the token.
1822
return true
1823
}
1824
p.resetInsertionMode()
1825
case a.Input, a.Keygen, a.Textarea:
1826
if p.elementInScope(selectScope, a.Select) {
1827
p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
1828
return false
1829
}
1830
// In order to properly ignore <textarea>, we need to change the tokenizer mode.
1831
p.tokenizer.NextIsNotRawText()
1832
// Ignore the token.
1833
return true
1834
case a.Script, a.Template:
1835
return inHeadIM(p)
1836
case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
1837
// Don't let the tokenizer go into raw text mode when there are raw tags
1838
// to be ignored. These tags should be ignored from the tokenizer
1839
// properly.
1840
p.tokenizer.NextIsNotRawText()
1841
// Ignore the token.
1842
return true
1843
}
1844
case EndTagToken:
1845
switch p.tok.DataAtom {
1846
case a.Option:
1847
if p.top().DataAtom == a.Option {
1848
p.oe.pop()
1849
}
1850
case a.Optgroup:
1851
i := len(p.oe) - 1
1852
if p.oe[i].DataAtom == a.Option {
1853
i--
1854
}
1855
if p.oe[i].DataAtom == a.Optgroup {
1856
p.oe = p.oe[:i]
1857
}
1858
case a.Select:
1859
if !p.popUntil(selectScope, a.Select) {
1860
// Ignore the token.
1861
return true
1862
}
1863
p.resetInsertionMode()
1864
case a.Template:
1865
return inHeadIM(p)
1866
}
1867
case CommentToken:
1868
p.addChild(&Node{
1869
Type: CommentNode,
1870
Data: p.tok.Data,
1871
})
1872
case DoctypeToken:
1873
// Ignore the token.
1874
return true
1875
case ErrorToken:
1876
return inBodyIM(p)
1877
}
1878
1879
return true
1880
}
1881
1882
// Section 12.2.6.4.17.
1883
func inSelectInTableIM(p *parser) bool {
1884
switch p.tok.Type {
1885
case StartTagToken, EndTagToken:
1886
switch p.tok.DataAtom {
1887
case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
1888
if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
1889
// Ignore the token.
1890
return true
1891
}
1892
// This is like p.popUntil(selectScope, a.Select), but it also
1893
// matches <math select>, not just <select>. Matching the MathML
1894
// tag is arguably incorrect (conceptually), but it mimics what
1895
// Chromium does.
1896
for i := len(p.oe) - 1; i >= 0; i-- {
1897
if n := p.oe[i]; n.DataAtom == a.Select {
1898
p.oe = p.oe[:i]
1899
break
1900
}
1901
}
1902
p.resetInsertionMode()
1903
return false
1904
}
1905
}
1906
return inSelectIM(p)
1907
}
1908
1909
// Section 12.2.6.4.18.
1910
func inTemplateIM(p *parser) bool {
1911
switch p.tok.Type {
1912
case TextToken, CommentToken, DoctypeToken:
1913
return inBodyIM(p)
1914
case StartTagToken:
1915
switch p.tok.DataAtom {
1916
case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
1917
return inHeadIM(p)
1918
case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
1919
p.templateStack.pop()
1920
p.templateStack = append(p.templateStack, inTableIM)
1921
p.im = inTableIM
1922
return false
1923
case a.Col:
1924
p.templateStack.pop()
1925
p.templateStack = append(p.templateStack, inColumnGroupIM)
1926
p.im = inColumnGroupIM
1927
return false
1928
case a.Tr:
1929
p.templateStack.pop()
1930
p.templateStack = append(p.templateStack, inTableBodyIM)
1931
p.im = inTableBodyIM
1932
return false
1933
case a.Td, a.Th:
1934
p.templateStack.pop()
1935
p.templateStack = append(p.templateStack, inRowIM)
1936
p.im = inRowIM
1937
return false
1938
default:
1939
p.templateStack.pop()
1940
p.templateStack = append(p.templateStack, inBodyIM)
1941
p.im = inBodyIM
1942
return false
1943
}
1944
case EndTagToken:
1945
switch p.tok.DataAtom {
1946
case a.Template:
1947
return inHeadIM(p)
1948
default:
1949
// Ignore the token.
1950
return true
1951
}
1952
case ErrorToken:
1953
if !p.oe.contains(a.Template) {
1954
// Ignore the token.
1955
return true
1956
}
1957
// TODO: remove this divergence from the HTML5 spec.
1958
//
1959
// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
1960
p.generateImpliedEndTags()
1961
for i := len(p.oe) - 1; i >= 0; i-- {
1962
if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
1963
p.oe = p.oe[:i]
1964
break
1965
}
1966
}
1967
p.clearActiveFormattingElements()
1968
p.templateStack.pop()
1969
p.resetInsertionMode()
1970
return false
1971
}
1972
return false
1973
}
1974
1975
// Section 12.2.6.4.19.
1976
func afterBodyIM(p *parser) bool {
1977
switch p.tok.Type {
1978
case ErrorToken:
1979
// Stop parsing.
1980
return true
1981
case TextToken:
1982
s := strings.TrimLeft(p.tok.Data, whitespace)
1983
if len(s) == 0 {
1984
// It was all whitespace.
1985
return inBodyIM(p)
1986
}
1987
case StartTagToken:
1988
if p.tok.DataAtom == a.Html {
1989
return inBodyIM(p)
1990
}
1991
case EndTagToken:
1992
if p.tok.DataAtom == a.Html {
1993
if !p.fragment {
1994
p.im = afterAfterBodyIM
1995
}
1996
return true
1997
}
1998
case CommentToken:
1999
// The comment is attached to the <html> element.
2000
if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
2001
panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
2002
}
2003
p.oe[0].AppendChild(&Node{
2004
Type: CommentNode,
2005
Data: p.tok.Data,
2006
})
2007
return true
2008
}
2009
p.im = inBodyIM
2010
return false
2011
}
2012
2013
// Section 12.2.6.4.20.
2014
func inFramesetIM(p *parser) bool {
2015
switch p.tok.Type {
2016
case CommentToken:
2017
p.addChild(&Node{
2018
Type: CommentNode,
2019
Data: p.tok.Data,
2020
})
2021
case TextToken:
2022
// Ignore all text but whitespace.
2023
s := strings.Map(func(c rune) rune {
2024
switch c {
2025
case ' ', '\t', '\n', '\f', '\r':
2026
return c
2027
}
2028
return -1
2029
}, p.tok.Data)
2030
if s != "" {
2031
p.addText(s)
2032
}
2033
case StartTagToken:
2034
switch p.tok.DataAtom {
2035
case a.Html:
2036
return inBodyIM(p)
2037
case a.Frameset:
2038
p.addElement()
2039
case a.Frame:
2040
p.addElement()
2041
p.oe.pop()
2042
p.acknowledgeSelfClosingTag()
2043
case a.Noframes:
2044
return inHeadIM(p)
2045
}
2046
case EndTagToken:
2047
switch p.tok.DataAtom {
2048
case a.Frameset:
2049
if p.oe.top().DataAtom != a.Html {
2050
p.oe.pop()
2051
if p.oe.top().DataAtom != a.Frameset {
2052
p.im = afterFramesetIM
2053
return true
2054
}
2055
}
2056
}
2057
default:
2058
// Ignore the token.
2059
}
2060
return true
2061
}
2062
2063
// Section 12.2.6.4.21.
2064
func afterFramesetIM(p *parser) bool {
2065
switch p.tok.Type {
2066
case CommentToken:
2067
p.addChild(&Node{
2068
Type: CommentNode,
2069
Data: p.tok.Data,
2070
})
2071
case TextToken:
2072
// Ignore all text but whitespace.
2073
s := strings.Map(func(c rune) rune {
2074
switch c {
2075
case ' ', '\t', '\n', '\f', '\r':
2076
return c
2077
}
2078
return -1
2079
}, p.tok.Data)
2080
if s != "" {
2081
p.addText(s)
2082
}
2083
case StartTagToken:
2084
switch p.tok.DataAtom {
2085
case a.Html:
2086
return inBodyIM(p)
2087
case a.Noframes:
2088
return inHeadIM(p)
2089
}
2090
case EndTagToken:
2091
switch p.tok.DataAtom {
2092
case a.Html:
2093
p.im = afterAfterFramesetIM
2094
return true
2095
}
2096
default:
2097
// Ignore the token.
2098
}
2099
return true
2100
}
2101
2102
// Section 12.2.6.4.22.
2103
func afterAfterBodyIM(p *parser) bool {
2104
switch p.tok.Type {
2105
case ErrorToken:
2106
// Stop parsing.
2107
return true
2108
case TextToken:
2109
s := strings.TrimLeft(p.tok.Data, whitespace)
2110
if len(s) == 0 {
2111
// It was all whitespace.
2112
return inBodyIM(p)
2113
}
2114
case StartTagToken:
2115
if p.tok.DataAtom == a.Html {
2116
return inBodyIM(p)
2117
}
2118
case CommentToken:
2119
p.doc.AppendChild(&Node{
2120
Type: CommentNode,
2121
Data: p.tok.Data,
2122
})
2123
return true
2124
case DoctypeToken:
2125
return inBodyIM(p)
2126
}
2127
p.im = inBodyIM
2128
return false
2129
}
2130
2131
// Section 12.2.6.4.23.
2132
func afterAfterFramesetIM(p *parser) bool {
2133
switch p.tok.Type {
2134
case CommentToken:
2135
p.doc.AppendChild(&Node{
2136
Type: CommentNode,
2137
Data: p.tok.Data,
2138
})
2139
case TextToken:
2140
// Ignore all text but whitespace.
2141
s := strings.Map(func(c rune) rune {
2142
switch c {
2143
case ' ', '\t', '\n', '\f', '\r':
2144
return c
2145
}
2146
return -1
2147
}, p.tok.Data)
2148
if s != "" {
2149
p.tok.Data = s
2150
return inBodyIM(p)
2151
}
2152
case StartTagToken:
2153
switch p.tok.DataAtom {
2154
case a.Html:
2155
return inBodyIM(p)
2156
case a.Noframes:
2157
return inHeadIM(p)
2158
}
2159
case DoctypeToken:
2160
return inBodyIM(p)
2161
default:
2162
// Ignore the token.
2163
}
2164
return true
2165
}
2166
2167
// ignoreTheRemainingTokens has the signature of an insertion mode function
// and reports every token as consumed without doing anything with it.
func ignoreTheRemainingTokens(_ *parser) bool {
	return true
}
2170
2171
// whitespaceOrNUL is the whitespace character set extended with the NUL
// byte; parseForeignContent uses it as a trim cutset when updating the
// frameset-ok flag.
const whitespaceOrNUL = whitespace + "\x00"
2172
2173
// Section 12.2.6.5
2174
func parseForeignContent(p *parser) bool {
2175
switch p.tok.Type {
2176
case TextToken:
2177
if p.framesetOK {
2178
p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
2179
}
2180
p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
2181
p.addText(p.tok.Data)
2182
case CommentToken:
2183
p.addChild(&Node{
2184
Type: CommentNode,
2185
Data: p.tok.Data,
2186
})
2187
case StartTagToken:
2188
if !p.fragment {
2189
b := breakout[p.tok.Data]
2190
if p.tok.DataAtom == a.Font {
2191
loop:
2192
for _, attr := range p.tok.Attr {
2193
switch attr.Key {
2194
case "color", "face", "size":
2195
b = true
2196
break loop
2197
}
2198
}
2199
}
2200
if b {
2201
for i := len(p.oe) - 1; i >= 0; i-- {
2202
n := p.oe[i]
2203
if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
2204
p.oe = p.oe[:i+1]
2205
break
2206
}
2207
}
2208
return false
2209
}
2210
}
2211
current := p.adjustedCurrentNode()
2212
switch current.Namespace {
2213
case "math":
2214
adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
2215
case "svg":
2216
// Adjust SVG tag names. The tokenizer lower-cases tag names, but
2217
// SVG wants e.g. "foreignObject" with a capital second "O".
2218
if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
2219
p.tok.DataAtom = a.Lookup([]byte(x))
2220
p.tok.Data = x
2221
}
2222
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
2223
default:
2224
panic("html: bad parser state: unexpected namespace")
2225
}
2226
adjustForeignAttributes(p.tok.Attr)
2227
namespace := current.Namespace
2228
p.addElement()
2229
p.top().Namespace = namespace
2230
if namespace != "" {
2231
// Don't let the tokenizer go into raw text mode in foreign content
2232
// (e.g. in an SVG <title> tag).
2233
p.tokenizer.NextIsNotRawText()
2234
}
2235
if p.hasSelfClosingToken {
2236
p.oe.pop()
2237
p.acknowledgeSelfClosingTag()
2238
}
2239
case EndTagToken:
2240
if strings.EqualFold(p.oe[len(p.oe)-1].Data, p.tok.Data) {
2241
p.oe = p.oe[:len(p.oe)-1]
2242
return true
2243
}
2244
for i := len(p.oe) - 1; i >= 0; i-- {
2245
if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
2246
p.oe = p.oe[:i]
2247
return true
2248
}
2249
if i > 0 && p.oe[i-1].Namespace == "" {
2250
break
2251
}
2252
}
2253
return p.im(p)
2254
default:
2255
// Ignore the token.
2256
}
2257
return true
2258
}
2259
2260
// Section 12.2.4.2.
2261
func (p *parser) adjustedCurrentNode() *Node {
2262
if len(p.oe) == 1 && p.fragment && p.context != nil {
2263
return p.context
2264
}
2265
return p.oe.top()
2266
}
2267
2268
// Section 12.2.6.
2269
func (p *parser) inForeignContent() bool {
2270
if len(p.oe) == 0 {
2271
return false
2272
}
2273
n := p.adjustedCurrentNode()
2274
if n.Namespace == "" {
2275
return false
2276
}
2277
if mathMLTextIntegrationPoint(n) {
2278
if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
2279
return false
2280
}
2281
if p.tok.Type == TextToken {
2282
return false
2283
}
2284
}
2285
if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
2286
return false
2287
}
2288
if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
2289
return false
2290
}
2291
if p.tok.Type == ErrorToken {
2292
return false
2293
}
2294
return true
2295
}
2296
2297
// parseImpliedToken parses a token as though it had appeared in the parser's
2298
// input.
2299
func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
2300
realToken, selfClosing := p.tok, p.hasSelfClosingToken
2301
p.tok = Token{
2302
Type: t,
2303
DataAtom: dataAtom,
2304
Data: data,
2305
}
2306
p.hasSelfClosingToken = false
2307
p.parseCurrentToken()
2308
p.tok, p.hasSelfClosingToken = realToken, selfClosing
2309
}
2310
2311
// parseCurrentToken runs the current token through the parsing routines
2312
// until it is consumed.
2313
func (p *parser) parseCurrentToken() {
2314
if p.tok.Type == SelfClosingTagToken {
2315
p.hasSelfClosingToken = true
2316
p.tok.Type = StartTagToken
2317
}
2318
2319
consumed := false
2320
for !consumed {
2321
if p.inForeignContent() {
2322
consumed = parseForeignContent(p)
2323
} else {
2324
consumed = p.im(p)
2325
}
2326
}
2327
2328
if p.hasSelfClosingToken {
2329
// This is a parse error, but ignore it.
2330
p.hasSelfClosingToken = false
2331
}
2332
}
2333
2334
// parse reads tokens from the tokenizer and runs each of them through the
// parser until the tokenizer reports io.EOF.
//
// It returns nil when the input was read to the end, the tokenizer's error
// for any other read failure, and an error built from the panic value when
// the parsing code panics.
func (p *parser) parse() (err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("%s", r)
		}
	}()

	// Iterate until EOF. Any other error will cause an early return.
	for err != io.EOF {
		// CDATA sections are allowed only in foreign content.
		top := p.oe.top()
		inForeign := top != nil && top.Namespace != ""
		p.tokenizer.AllowCDATA(inForeign)

		// Read and parse the next token.
		p.tokenizer.Next()
		p.tok = p.tokenizer.Token()
		if p.tok.Type == ErrorToken {
			if err = p.tokenizer.Err(); err != nil && err != io.EOF {
				return err
			}
		}
		// The error token for EOF is still run through the parser.
		p.parseCurrentToken()
	}
	return nil
}
2358
2359
// Parse returns the parse tree for the HTML from the given Reader.
2360
//
2361
// It implements the HTML5 parsing algorithm
2362
// (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
2363
// which is very complicated. The resultant tree can contain implicitly created
2364
// nodes that have no explicit <tag> listed in r's data, and nodes' parents can
2365
// differ from the nesting implied by a naive processing of start and end
2366
// <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
2367
// with no corresponding node in the resulting tree.
2368
//
2369
// Parse will reject HTML that is nested deeper than 512 elements.
2370
//
2371
// The input is assumed to be UTF-8 encoded.
2372
func Parse(r io.Reader) (*Node, error) {
2373
return ParseWithOptions(r)
2374
}
2375
2376
// ParseFragment parses a fragment of HTML and returns the nodes that were
2377
// found. If the fragment is the InnerHTML for an existing element, pass that
2378
// element in context.
2379
//
2380
// It has the same intricacies as Parse.
2381
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
2382
return ParseFragmentWithOptions(r, context)
2383
}
2384
2385
// ParseOption configures a parser.
2386
type ParseOption func(p *parser)
2387
2388
// ParseOptionEnableScripting configures the scripting flag.
2389
// https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
2390
//
2391
// By default, scripting is enabled.
2392
func ParseOptionEnableScripting(enable bool) ParseOption {
2393
return func(p *parser) {
2394
p.scripting = enable
2395
}
2396
}
2397
2398
// ParseWithOptions is like Parse, with options.
2399
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
2400
p := &parser{
2401
tokenizer: NewTokenizer(r),
2402
doc: &Node{
2403
Type: DocumentNode,
2404
},
2405
scripting: true,
2406
framesetOK: true,
2407
im: initialIM,
2408
}
2409
2410
for _, f := range opts {
2411
f(p)
2412
}
2413
2414
if err := p.parse(); err != nil {
2415
return nil, err
2416
}
2417
return p.doc, nil
2418
}
2419
2420
// ParseFragmentWithOptions is like ParseFragment, with options.
2421
func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
2422
contextTag := ""
2423
if context != nil {
2424
if context.Type != ElementNode {
2425
return nil, errors.New("html: ParseFragment of non-element Node")
2426
}
2427
// The next check isn't just context.DataAtom.String() == context.Data because
2428
// it is valid to pass an element whose tag isn't a known atom. For example,
2429
// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
2430
if context.DataAtom != a.Lookup([]byte(context.Data)) {
2431
return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
2432
}
2433
contextTag = context.DataAtom.String()
2434
}
2435
p := &parser{
2436
doc: &Node{
2437
Type: DocumentNode,
2438
},
2439
scripting: true,
2440
fragment: true,
2441
context: context,
2442
}
2443
if context != nil && context.Namespace != "" {
2444
p.tokenizer = NewTokenizer(r)
2445
} else {
2446
p.tokenizer = NewTokenizerFragment(r, contextTag)
2447
}
2448
2449
for _, f := range opts {
2450
f(p)
2451
}
2452
2453
root := &Node{
2454
Type: ElementNode,
2455
DataAtom: a.Html,
2456
Data: a.Html.String(),
2457
}
2458
p.doc.AppendChild(root)
2459
p.oe = nodeStack{root}
2460
if context != nil && context.DataAtom == a.Template {
2461
p.templateStack = append(p.templateStack, inTemplateIM)
2462
}
2463
p.resetInsertionMode()
2464
2465
for n := context; n != nil; n = n.Parent {
2466
if n.Type == ElementNode && n.DataAtom == a.Form {
2467
p.form = n
2468
break
2469
}
2470
}
2471
2472
if err := p.parse(); err != nil {
2473
return nil, err
2474
}
2475
2476
parent := p.doc
2477
if context != nil {
2478
parent = root
2479
}
2480
2481
var result []*Node
2482
for c := parent.FirstChild; c != nil; {
2483
next := c.NextSibling
2484
parent.RemoveChild(c)
2485
result = append(result, c)
2486
c = next
2487
}
2488
return result, nil
2489
}
2490
2491