From 55f0c8b2cddff16de2bf101ec997bf96813615d4 Mon Sep 17 00:00:00 2001 From: Andrew Balholm <andybalholm@gmail.com> Date: Fri, 27 Jul 2012 09:27:10 +1000 Subject: [PATCH] exp/html: replace NUL bytes in plaintext, raw text, and RCDATA If NUL bytes occur inside certain elements, convert them to U+FFFD replacement character. Pass 1 additional test. R=nigeltao CC=golang-dev https://golang.org/cl/6452047 --- .../exp/html/testlogs/plain-text-unsafe.dat.log | 2 +- src/pkg/exp/html/token.go | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log index 56da0ba88f..65ee908f55 100644 --- a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log +++ b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log @@ -7,7 +7,7 @@ PASS "<html>\x00\n <frameset></frameset>" PASS "<html><select>\x00" PASS "\x00" PASS "<body>\x00" -FAIL "<plaintext>\x00filler\x00text\x00" +PASS "<plaintext>\x00filler\x00text\x00" FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>" FAIL "<body><!\x00>" FAIL "<body><!\x00filler\x00text>" diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go index b20de87bee..3dc317ebb7 100644 --- a/src/pkg/exp/html/token.go +++ b/src/pkg/exp/html/token.go @@ -152,6 +152,9 @@ type Tokenizer struct { rawTag string // textIsRaw is whether the current text token's data is not escaped. textIsRaw bool + // convertNUL is whether NUL bytes in the current token's data should + // be converted into \ufffd replacement characters. + convertNUL bool } // Err returns the error associated with the most recent ErrorToken token. @@ -597,16 +600,19 @@ func (z *Tokenizer) Next() TokenType { for z.err == nil { z.readByte() } + z.data.end = z.raw.end z.textIsRaw = true } else { z.readRawOrRCDATA() } if z.data.end > z.data.start { z.tt = TextToken + z.convertNUL = true return z.tt } } z.textIsRaw = false + z.convertNUL = false loop: for { @@ -731,6 +737,11 @@ func convertNewlines(s []byte) []byte { return s } +var ( + nul = []byte("\x00") + replacement = []byte("\ufffd") +) + // Text returns the unescaped text of a text, comment or doctype token. The // contents of the returned slice may change on the next call to Next. func (z *Tokenizer) Text() []byte { @@ -740,6 +751,9 @@ func (z *Tokenizer) Text() []byte { z.data.start = z.raw.end z.data.end = z.raw.end s = convertNewlines(s) + if z.convertNUL && bytes.Contains(s, nul) { + s = bytes.Replace(s, nul, replacement, -1) + } if !z.textIsRaw { s = unescape(s, false) } -- 2.30.9