Commit a5ff8ad9 authored by Nigel Tao

html: tokenize HTML comments.

I'm not sure if it's 100% correct wrt the HTML5 specification,
but the test suite has plenty of HTML comment test cases, and
we'll shake out any tokenization bugs as the parser improves its
coverage.

R=gri
CC=golang-dev
https://golang.org/cl/4186055
parent 3a2d6478
@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}

+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
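For example, a loop in the same style as the package documentation's anchor-text example could collect comment text like this (a sketch against this CL's API, assuming the html and fmt packages are imported and r is an io.Reader of UTF-8 HTML):

	z := html.NewTokenizer(r)
	z.ReturnComments = true
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt == html.CommentToken {
			// Token().Data is the comment text, unescaped and without
			// the surrounding "<!--" and "-->".
			fmt.Println(z.Token().Data)
		}
	}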
@@ -25,6 +25,8 @@ const (
 	EndTagToken
 	// A SelfClosingTagToken tag looks like <br/>.
 	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
 )

 // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
 		return "EndTag"
 	case SelfClosingTagToken:
 		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
...@@ -52,8 +56,8 @@ type Attribute struct { ...@@ -52,8 +56,8 @@ type Attribute struct {
@@ -52,8 +56,8 @@ type Attribute struct {
 }

 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
 // than "a&lt;b").
 type Token struct {
 	Type TokenType
@@ -91,12 +95,18 @@ func (t Token) String() string {
 		return "</" + t.tagString() + ">"
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + EscapeString(t.Data) + "-->"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }

 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
+	// If ReturnComments is set, Next returns comment tokens;
+	// otherwise it skips over comments (default).
+	ReturnComments bool
+
 	// r is the source of the HTML text.
 	r io.Reader

 	// tt is the TokenType of the most recently read token. If tt == Error
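Note that the CommentToken case of String re-escapes the comment data, matching the Token doc comment above. A sketch of the round trip, with illustrative values:

	t := Token{Type: CommentToken, Data: "a<b"}
	fmt.Println(t.String()) // prints <!--a&lt;b-->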
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 	panic("unreachable")
 }

+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
+	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+	for i := 0; i < 2; i++ {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		if c != '-' {
+			return z.nextText(), nil
+		}
+	}
+	// <!--> is a valid comment.
+	for dashCount := 2; ; {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		switch c {
+		case '-':
+			dashCount++
+		case '>':
+			if dashCount >= 2 {
+				return CommentToken, nil
+			}
+			fallthrough
+		default:
+			dashCount = 0
+		}
+	}
+	panic("unreachable")
+}
+
 // nextTag returns the next TokenType starting from the tag open state.
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
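The dashCount loop above is the entire comment state machine: a comment ends at a '>' preceded by at least two consecutive '-'s, and starting dashCount at 2 is what makes "<!-->" valid. A standalone sketch of the same logic, using endsComment as a hypothetical helper (not part of this CL), shows how the trickier test inputs below behave:

	package main

	import "fmt"

	// endsComment reports whether s, the input after "<!--", reaches a
	// valid comment close: a '>' preceded by two or more '-'s in a row.
	// It mirrors nextMarkupDeclaration's dashCount loop.
	func endsComment(s string) bool {
		dashCount := 2 // "<!--" itself supplies two dashes, so "<!-->" closes at once
		for i := 0; i < len(s); i++ {
			switch s[i] {
			case '-':
				dashCount++
			case '>':
				if dashCount >= 2 {
					return true
				}
				dashCount = 0
			default:
				dashCount = 0
			}
		}
		return false // ran off the end of input: unterminated, like "a<!--z"
	}

	func main() {
		fmt.Println(endsComment(">"))     // true: "<!-->" (test comment1)
		fmt.Println(endsComment("x>-->")) // true: ">" alone cannot close, the final "-->" does (comment3)
		fmt.Println(endsComment("z"))     // false: unterminated (comment8)
	}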
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 		tt = StartTagToken
 	case c == '!':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+		return z.nextMarkupDeclaration()
 	case c == '?':
 		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	panic("unreachable")
 }

-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	if z.err != nil {
-		z.tt = ErrorToken
-		return z.tt
-	}
-	z.p0 = z.p1
-	c, err := z.readByte()
-	if err != nil {
-		z.tt, z.err = ErrorToken, err
-		return z.tt
-	}
-	if c == '<' {
-		z.tt, z.err = z.nextTag()
-		return z.tt
-	}
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
 	panic("unreachable")
 }

+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	for {
+		if z.err != nil {
+			z.tt = ErrorToken
+			return z.tt
+		}
+		z.p0 = z.p1
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = ErrorToken, err
+			return z.tt
+		}
+		if c == '<' {
+			z.tt, z.err = z.nextTag()
+			if z.tt == CommentToken && !z.ReturnComments {
+				continue
+			}
+			return z.tt
+		}
+		return z.nextText()
+	}
+	panic("unreachable")
+}
+
 // trim returns the largest j such that z.buf[i:j] contains only white space,
 // or only white space plus the final ">" or "/>" of the raw data.
 func (z *Tokenizer) trim(i int) int {
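With ReturnComments left at its false default, the continue in Next makes comments invisible to callers. A quick sketch, matching the comment0 test case below (assuming fmt and bytes are imported):

	z := NewTokenizer(bytes.NewBuffer([]byte("abc<b><!-- skipme --></b>def")))
	for {
		if z.Next() == ErrorToken {
			break // os.EOF once the input is exhausted
		}
		fmt.Println(z.Token()) // prints abc, <b>, </b> and def; the comment never surfaces
	}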
@@ -299,12 +353,27 @@ loop:
 		return z.buf[i0:i], z.trim(i)
 	}

-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
-	s := unescape(z.Raw())
-	z.p0 = z.p1
-	return s
+	switch z.tt {
+	case TextToken:
+		s := unescape(z.Raw())
+		z.p0 = z.p1
+		return s
+	case CommentToken:
+		// We trim the "<!--" from the left and the "-->" from the right.
+		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+		i0 := z.p0 + 4
+		i1 := z.p1 - 3
+		z.p0 = z.p1
+		var s []byte
+		if i0 < i1 {
+			s = unescape(z.buf[i0:i1])
+		}
+		return s
+	}
+	return nil
 }

 // TagName returns the lower-cased name of a tag token (the `img` out of
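The index arithmetic strips fixed-width delimiters: z.p0 + 4 skips "<!--" and z.p1 - 3 drops "-->". Worked by hand on illustrative buffers:

	raw := []byte("<!--x-->")      // z.p0 = 0, z.p1 = 8
	i0, i1 := 0+4, 8-3             // i0 = 4, i1 = 5
	fmt.Printf("%q\n", raw[i0:i1]) // "x"

	// For "<!-->": z.p0 = 0, z.p1 = 5, so i0 = 4 and i1 = 2.
	// The endpoints overlap (i0 > i1), so Text returns nil.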
@@ -372,7 +441,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken:
+	case TextToken, CommentToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
@@ -7,6 +7,7 @@ package html
 import (
 	"bytes"
 	"os"
+	"strings"
 	"testing"
 )
@@ -15,8 +16,8 @@ type tokenTest struct {
 	desc string
 	// The HTML to parse.
 	html string
-	// The string representations of the expected tokens.
-	tokens []string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
}

var tokenTests = []tokenTest{
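Joining the expected tokens with '$' keeps each test case on a single line. The test loop below recovers the slice with the pre-Go 1 strings.Split, whose third argument of -1 means no limit on the number of splits:

	// "abc$<b>$</b>$def" expands to four expected token strings.
	for i, s := range strings.Split("abc$<b>$</b>$def", "$", -1) {
		fmt.Println(i, s) // 0 abc, 1 <b>, 2 </b>, 3 def
	}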
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
 	{
 		"text",
 		"foo bar",
-		[]string{
-			"foo bar",
-		},
+		"foo bar",
 	},
 	// An entity.
 	{
 		"entity",
 		"one &lt; two",
-		[]string{
-			"one &lt; two",
-		},
+		"one &lt; two",
 	},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{
 		"tags",
 		"<a>b<c/>d</e>",
-		[]string{
-			"<a>",
-			"b",
-			"<c/>",
-			"d",
-			"</e>",
-		},
+		"<a>$b$<c/>$d$</e>",
 	},
+	// Comments.
+	{
+		"comment0",
+		"abc<b><!-- skipme --></b>def",
+		"abc$<b>$</b>$def",
+	},
+	{
+		"comment1",
+		"a<!-->z",
+		"a$z",
+	},
+	{
+		"comment2",
+		"a<!--->z",
+		"a$z",
+	},
+	{
+		"comment3",
+		"a<!--x>-->z",
+		"a$z",
+	},
+	{
+		"comment4",
+		"a<!--x->-->z",
+		"a$z",
+	},
+	{
+		"comment5",
+		"a<!>z",
+		"a$&lt;!&gt;z",
+	},
+	{
+		"comment6",
+		"a<!->z",
+		"a$&lt;!-&gt;z",
+	},
+	{
+		"comment7",
+		"a<!---<>z",
+		"a$&lt;!---&lt;&gt;z",
+	},
+	{
+		"comment8",
+		"a<!--z",
+		"a$&lt;!--z",
+	},
 	// An attribute with a backslash.
 	{
 		"backslash",
 		`<p id="a\"b">`,
-		[]string{
-			`<p id="a&quot;b">`,
-		},
+		`<p id="a&quot;b">`,
 	},
 	// Entities, tag name and attribute key lower-casing, and whitespace
 	// normalization within a tag.
 	{
 		"tricky",
 		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-		[]string{
-			`<p id="a&quot;B" foo="bar">`,
-			"<em>",
-			"te&lt;&amp;;xt",
-			"</em>",
-			"</p>",
-		},
+		`<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 	},
 	// A non-existant entity. Tokenizing and converting back to a string should
 	// escape the "&" to become "&amp;".
 	{
 		"noSuchEntity",
 		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
-		[]string{
-			`<a b="c&amp;noSuchEntity;d">`,
-			"&lt;&amp;alsoDoesntExist;&amp;",
-		},
+		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 	},
 }
@@ -87,7 +113,7 @@ func TestTokenizer(t *testing.T) {
 loop:
 	for _, tt := range tokenTests {
 		z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-		for i, s := range tt.tokens {
+		for i, s := range strings.Split(tt.golden, "$", -1) {
 			if z.Next() == ErrorToken {
 				t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 				continue loop