Commit ce27b00f authored by Andrew Balholm, committed by Nigel Tao

html: implement fragment parsing algorithm

Pass the tests in tests4.dat.

R=nigeltao
CC=golang-dev
https://golang.org/cl/5447055
parent 595efd0d
...@@ -39,6 +39,9 @@ type parser struct { ...@@ -39,6 +39,9 @@ type parser struct {
fosterParenting bool fosterParenting bool
// quirks is whether the parser is operating in "quirks mode." // quirks is whether the parser is operating in "quirks mode."
quirks bool quirks bool
// context is the context element when parsing an HTML fragment
// (section 11.4).
context *Node
} }
func (p *parser) top() *Node { func (p *parser) top() *Node {
...@@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() { ...@@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() {
func (p *parser) resetInsertionMode() { func (p *parser) resetInsertionMode() {
for i := len(p.oe) - 1; i >= 0; i-- { for i := len(p.oe) - 1; i >= 0; i-- {
n := p.oe[i] n := p.oe[i]
if i == 0 { if i == 0 && p.context != nil {
// TODO: set n to the context element, for HTML fragment parsing. n = p.context
} }
switch n.Data { switch n.Data {
case "select": case "select":
p.im = inSelectIM p.im = inSelectIM
...@@ -1516,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool { ...@@ -1516,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool {
return true return true
} }
// Parse returns the parse tree for the HTML from the given Reader. func (p *parser) parse() error {
// The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
Type: DocumentNode,
},
scripting: true,
framesetOK: true,
im: initialIM,
}
// Iterate until EOF. Any other error will cause an early return. // Iterate until EOF. Any other error will cause an early return.
consumed := true consumed := true
for { for {
...@@ -1536,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) { ...@@ -1536,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) {
if err == io.EOF { if err == io.EOF {
break break
} }
return nil, err return err
} }
} }
consumed = p.im(p) consumed = p.im(p)
...@@ -1547,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) { ...@@ -1547,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) {
break break
} }
} }
return nil
}
// Parse returns the parse tree for the HTML from the given Reader.
// The input is assumed to be UTF-8 encoded.
func Parse(r io.Reader) (*Node, error) {
p := &parser{
tokenizer: NewTokenizer(r),
doc: &Node{
Type: DocumentNode,
},
scripting: true,
framesetOK: true,
im: initialIM,
}
err := p.parse()
if err != nil {
return nil, err
}
return p.doc, nil return p.doc, nil
} }
// ParseFragment parses the HTML fragment read from r and returns the
// top-level nodes that the parse produced. When the fragment is the InnerHTML
// of an existing element, supply that element as context; context may be nil.
// Every returned node has its Parent field cleared.
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
	root := &Node{
		Type: ElementNode,
		Data: "html",
	}
	p := &parser{
		tokenizer: NewTokenizer(r),
		doc:       &Node{Type: DocumentNode},
		scripting: true,
		context:   context,
	}

	// For the context elements listed below, the tokenizer's rawTag is set to
	// the context's tag name before any input is consumed.
	if context != nil {
		switch tag := context.Data; tag {
		case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
			p.tokenizer.rawTag = tag
		}
	}

	// Parsing starts from a synthetic "html" element that is added to the
	// document and is the only entry in p.oe when the insertion mode is reset.
	p.doc.Add(root)
	p.oe = nodeStack{root}
	p.resetInsertionMode()

	// Record the nearest "form" element among context and its ancestors, if
	// there is one.
	for anc := context; anc != nil; anc = anc.Parent {
		if anc.Type != ElementNode || anc.Data != "form" {
			continue
		}
		p.form = anc
		break
	}

	if err := p.parse(); err != nil {
		return nil, err
	}

	// With a context element the results are the children of the synthetic
	// root; without one they are the children of the document itself.
	holder := root
	if context == nil {
		holder = p.doc
	}

	// Detach the children from their holder before handing them back.
	nodes := holder.Child
	holder.Child = nil
	for _, child := range nodes {
		child.Parent = nil
	}
	return nodes, nil
}
...@@ -16,21 +16,21 @@ import ( ...@@ -16,21 +16,21 @@ import (
) )
// readParseTest reads a single test case from r. // readParseTest reads a single test case from r.
func readParseTest(r *bufio.Reader) (text, want string, err error) { func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
line, err := r.ReadSlice('\n') line, err := r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
var b []byte var b []byte
// Read the HTML. // Read the HTML.
if string(line) != "#data\n" { if string(line) != "#data\n" {
return "", "", fmt.Errorf(`got %q want "#data\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
if line[0] == '#' { if line[0] == '#' {
break break
...@@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) { ...@@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) {
// Skip the error list. // Skip the error list.
if string(line) != "#errors\n" { if string(line) != "#errors\n" {
return "", "", fmt.Errorf(`got %q want "#errors\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil { if err != nil {
return "", "", err return "", "", "", err
} }
if line[0] == '#' { if line[0] == '#' {
break break
} }
} }
if string(line) == "#document-fragment\n" {
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
context = strings.TrimSpace(string(line))
line, err = r.ReadSlice('\n')
if err != nil {
return "", "", "", err
}
}
// Read the dump of what the parse tree should be. // Read the dump of what the parse tree should be.
if string(line) != "#document\n" { if string(line) != "#document\n" {
return "", "", fmt.Errorf(`got %q want "#document\n"`, line) return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
} }
for { for {
line, err = r.ReadSlice('\n') line, err = r.ReadSlice('\n')
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
return "", "", err return "", "", "", err
} }
if len(line) == 0 || len(line) == 1 && line[0] == '\n' { if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
break break
} }
b = append(b, line...) b = append(b, line...)
} }
return text, string(b), nil return text, string(b), context, nil
} }
func dumpIndent(w io.Writer, level int) { func dumpIndent(w io.Writer, level int) {
...@@ -153,7 +165,7 @@ func TestParser(t *testing.T) { ...@@ -153,7 +165,7 @@ func TestParser(t *testing.T) {
{"tests1.dat", -1}, {"tests1.dat", -1},
{"tests2.dat", -1}, {"tests2.dat", -1},
{"tests3.dat", -1}, {"tests3.dat", -1},
// tests4.dat is fragment cases. {"tests4.dat", -1},
{"tests5.dat", -1}, {"tests5.dat", -1},
} }
for _, tf := range testFiles { for _, tf := range testFiles {
...@@ -164,17 +176,37 @@ func TestParser(t *testing.T) { ...@@ -164,17 +176,37 @@ func TestParser(t *testing.T) {
defer f.Close() defer f.Close()
r := bufio.NewReader(f) r := bufio.NewReader(f)
for i := 0; i != tf.n; i++ { for i := 0; i != tf.n; i++ {
text, want, err := readParseTest(r) text, want, context, err := readParseTest(r)
if err == io.EOF && tf.n == -1 { if err == io.EOF && tf.n == -1 {
break break
} }
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
doc, err := Parse(strings.NewReader(text))
if err != nil { var doc *Node
t.Fatal(err) if context == "" {
doc, err = Parse(strings.NewReader(text))
if err != nil {
t.Fatal(err)
}
} else {
contextNode := &Node{
Type: ElementNode,
Data: context,
}
nodes, err := ParseFragment(strings.NewReader(text), contextNode)
if err != nil {
t.Fatal(err)
}
doc = &Node{
Type: DocumentNode,
}
for _, n := range nodes {
doc.Add(n)
}
} }
got, err := dump(doc) got, err := dump(doc)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
...@@ -184,7 +216,7 @@ func TestParser(t *testing.T) { ...@@ -184,7 +216,7 @@ func TestParser(t *testing.T) {
t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want) t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
continue continue
} }
if renderTestBlacklist[text] { if renderTestBlacklist[text] || context != "" {
continue continue
} }
// Check that rendering and re-parsing results in an identical tree. // Check that rendering and re-parsing results in an identical tree.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment