From 55f0c8b2cddff16de2bf101ec997bf96813615d4 Mon Sep 17 00:00:00 2001
From: Andrew Balholm <andybalholm@gmail.com>
Date: Fri, 27 Jul 2012 09:27:10 +1000
Subject: [PATCH] exp/html: replace NUL bytes in plaintext, raw text, and
 RCDATA

If NUL bytes occur inside certain elements, convert them to U+FFFD
replacement character.

Pass 1 additional test.

R=nigeltao
CC=golang-dev
https://golang.org/cl/6452047
---
 .../exp/html/testlogs/plain-text-unsafe.dat.log    |  2 +-
 src/pkg/exp/html/token.go                          | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
index 56da0ba88f..65ee908f55 100644
--- a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
+++ b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
@@ -7,7 +7,7 @@ PASS "<html>\x00\n <frameset></frameset>"
 PASS "<html><select>\x00"
 PASS "\x00"
 PASS "<body>\x00"
-FAIL "<plaintext>\x00filler\x00text\x00"
+PASS "<plaintext>\x00filler\x00text\x00"
 FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>"
 FAIL "<body><!\x00>"
 FAIL "<body><!\x00filler\x00text>"
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go
index b20de87bee..3dc317ebb7 100644
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -152,6 +152,9 @@ type Tokenizer struct {
 	rawTag string
 	// textIsRaw is whether the current text token's data is not escaped.
 	textIsRaw bool
+	// convertNUL is whether NUL bytes in the current token's data should
+	// be converted into \ufffd replacement characters.
+	convertNUL bool
 }
 
 // Err returns the error associated with the most recent ErrorToken token.
@@ -597,16 +600,19 @@ func (z *Tokenizer) Next() TokenType {
 			for z.err == nil {
 				z.readByte()
 			}
+			z.data.end = z.raw.end
 			z.textIsRaw = true
 		} else {
 			z.readRawOrRCDATA()
 		}
 		if z.data.end > z.data.start {
 			z.tt = TextToken
+			z.convertNUL = true
 			return z.tt
 		}
 	}
 	z.textIsRaw = false
+	z.convertNUL = false
 
 loop:
 	for {
@@ -731,6 +737,11 @@ func convertNewlines(s []byte) []byte {
 	return s
 }
 
+var (
+	nul         = []byte("\x00")
+	replacement = []byte("\ufffd")
+)
+
 // Text returns the unescaped text of a text, comment or doctype token. The
 // contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
@@ -740,6 +751,9 @@ func (z *Tokenizer) Text() []byte {
 		z.data.start = z.raw.end
 		z.data.end = z.raw.end
 		s = convertNewlines(s)
+		if z.convertNUL && bytes.Contains(s, nul) {
+			s = bytes.Replace(s, nul, replacement, -1)
+		}
 		if !z.textIsRaw {
 			s = unescape(s, false)
 		}
-- 
2.30.9