parse_test.go 5.28 KB
Newer Older
Nigel Tao's avatar
Nigel Tao committed
1 2 3 4 5 6 7 8 9
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bufio"
	"bytes"
10
	"errors"
Nigel Tao's avatar
Nigel Tao committed
11 12 13
	"fmt"
	"io"
	"os"
Nigel Tao's avatar
Nigel Tao committed
14
	"strings"
Nigel Tao's avatar
Nigel Tao committed
15 16 17
	"testing"
)

18 19 20
// readParseTest reads a single test case from r.
//
// A test case consists of three sections, each introduced by a header line:
//
//	#data      the HTML input, returned as text with trailing newlines trimmed
//	#errors    a list of errors, which is skipped
//	#document  the expected tree dump, returned verbatim as want
//
// The #data and #errors sections end at the next line starting with '#'.
// The #document section ends at the first empty line or at end of input.
//
// If the very first line cannot be read, the error from r is returned
// unchanged; at end of input that is io.EOF. If the input ends after the
// "#data" header but before the "#document" header has been read, err is
// io.ErrUnexpectedEOF, so that callers which treat io.EOF as a clean end of
// input do not silently accept a truncated file.
func readParseTest(r *bufio.Reader) (text, want string, err error) {
	// truncated reports an end of input in the middle of a test case as
	// io.ErrUnexpectedEOF; any other error passes through unchanged.
	truncated := func(err error) error {
		if err == io.EOF {
			return io.ErrUnexpectedEOF
		}
		return err
	}

	line, err := r.ReadSlice('\n')
	if err != nil {
		return "", "", err
	}
	var b []byte

	// Read the HTML.
	if string(line) != "#data\n" {
		return "", "", fmt.Errorf(`got %q want "#data\n"`, line)
	}
	for {
		line, err = r.ReadSlice('\n')
		if err != nil {
			return "", "", truncated(err)
		}
		if line[0] == '#' {
			break
		}
		// ReadSlice's result is only valid until the next read, so copy it.
		b = append(b, line...)
	}
	text = strings.TrimRight(string(b), "\n")
	// text holds its own copy of the bytes, so the buffer can be reused.
	b = b[:0]

	// Skip the error list.
	if string(line) != "#errors\n" {
		return "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
	}
	for {
		line, err = r.ReadSlice('\n')
		if err != nil {
			return "", "", truncated(err)
		}
		if line[0] == '#' {
			break
		}
	}

	// Read the dump of what the parse tree should be.
	if string(line) != "#document\n" {
		return "", "", fmt.Errorf(`got %q want "#document\n"`, line)
	}
	for {
		line, err = r.ReadSlice('\n')
		// io.EOF is expected here: the last test case in a file need not be
		// followed by a blank line.
		if err != nil && err != io.EOF {
			return "", "", err
		}
		if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
			break
		}
		b = append(b, line...)
	}
	return text, string(b), nil
}

74
// dumpIndent writes the prefix for one line of a tree dump to w: the marker
// "| " followed by two spaces per nesting level.
//
// Errors from w are ignored; dump in this file passes an in-memory buffer.
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for i := 0; i < level; i++ {
		io.WriteString(w, "  ")
	}
}

81
func dumpLevel(w io.Writer, n *Node, level int) error {
82
	dumpIndent(w, level)
Nigel Tao's avatar
Nigel Tao committed
83 84
	switch n.Type {
	case ErrorNode:
85
		return errors.New("unexpected ErrorNode")
Nigel Tao's avatar
Nigel Tao committed
86
	case DocumentNode:
87
		return errors.New("unexpected DocumentNode")
Nigel Tao's avatar
Nigel Tao committed
88
	case ElementNode:
89
		fmt.Fprintf(w, "<%s>", n.Data)
90 91 92 93 94
		for _, a := range n.Attr {
			io.WriteString(w, "\n")
			dumpIndent(w, level+1)
			fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
		}
Nigel Tao's avatar
Nigel Tao committed
95
	case TextNode:
96
		fmt.Fprintf(w, `"%s"`, n.Data)
Nigel Tao's avatar
Nigel Tao committed
97
	case CommentNode:
98
		fmt.Fprintf(w, "<!-- %s -->", n.Data)
99
	case DoctypeNode:
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
		if n.Attr != nil {
			var p, s string
			for _, a := range n.Attr {
				switch a.Key {
				case "public":
					p = a.Val
				case "system":
					s = a.Val
				}
			}
			if p != "" || s != "" {
				fmt.Fprintf(w, ` "%s"`, p)
				fmt.Fprintf(w, ` "%s"`, s)
			}
		}
		io.WriteString(w, ">")
117
	case scopeMarkerNode:
118
		return errors.New("unexpected scopeMarkerNode")
Nigel Tao's avatar
Nigel Tao committed
119
	default:
120
		return errors.New("unknown node type")
Nigel Tao's avatar
Nigel Tao committed
121 122 123 124 125 126 127 128 129 130
	}
	io.WriteString(w, "\n")
	for _, c := range n.Child {
		if err := dumpLevel(w, c, level+1); err != nil {
			return err
		}
	}
	return nil
}

131
func dump(n *Node) (string, error) {
Nigel Tao's avatar
Nigel Tao committed
132 133 134 135
	if n == nil || len(n.Child) == 0 {
		return "", nil
	}
	b := bytes.NewBuffer(nil)
136 137 138 139
	for _, child := range n.Child {
		if err := dumpLevel(b, child, 0); err != nil {
			return "", err
		}
Nigel Tao's avatar
Nigel Tao committed
140 141 142 143 144
	}
	return b.String(), nil
}

func TestParser(t *testing.T) {
145 146 147 148 149 150 151
	testFiles := []struct {
		filename string
		// n is the number of test cases to run from that file.
		// -1 means all test cases.
		n int
	}{
		// TODO(nigeltao): Process all the test cases from all the .dat files.
152
		{"doctype01.dat", -1},
153
		{"tests1.dat", -1},
154
		{"tests2.dat", -1},
155
		{"tests3.dat", 12},
Nigel Tao's avatar
Nigel Tao committed
156
	}
157
	for _, tf := range testFiles {
158 159 160 161 162 163
		f, err := os.Open("testdata/webkit/" + tf.filename)
		if err != nil {
			t.Fatal(err)
		}
		defer f.Close()
		r := bufio.NewReader(f)
164
		for i := 0; i != tf.n; i++ {
165 166
			text, want, err := readParseTest(r)
			if err == io.EOF && tf.n == -1 {
167 168
				break
			}
Nigel Tao's avatar
Nigel Tao committed
169 170 171 172
			if err != nil {
				t.Fatal(err)
			}
			doc, err := Parse(strings.NewReader(text))
Nigel Tao's avatar
Nigel Tao committed
173 174 175
			if err != nil {
				t.Fatal(err)
			}
Nigel Tao's avatar
Nigel Tao committed
176
			got, err := dump(doc)
Nigel Tao's avatar
Nigel Tao committed
177 178 179 180
			if err != nil {
				t.Fatal(err)
			}
			// Compare the parsed tree to the #document section.
181
			if got != want {
182
				t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
183
				continue
Nigel Tao's avatar
Nigel Tao committed
184
			}
185
			if renderTestBlacklist[text] {
186 187
				continue
			}
188
			// Check that rendering and re-parsing results in an identical tree.
Nigel Tao's avatar
Nigel Tao committed
189 190 191 192 193 194 195 196 197 198 199 200 201
			pr, pw := io.Pipe()
			go func() {
				pw.CloseWithError(Render(pw, doc))
			}()
			doc1, err := Parse(pr)
			if err != nil {
				t.Fatal(err)
			}
			got1, err := dump(doc1)
			if err != nil {
				t.Fatal(err)
			}
			if got != got1 {
202
				t.Errorf("%s test #%d %q, got vs got1:\n----\n%s----\n%s----", tf.filename, i, text, got, got1)
203
				continue
Nigel Tao's avatar
Nigel Tao committed
204 205 206 207
			}
		}
	}
}
208 209 210 211 212 213 214 215 216

// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
//
// The map is keyed by the exact HTML input text of the test case.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`: true,
}