Commit 22e96054 authored by Robert Griesemer

scanner: match go/scanner and disallow NUL character;

also check for illegal UTF-8 sequences

R=rsc
CC=golang-dev
https://golang.org/cl/218061
parent 0485a999
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// A general-purpose scanner for text. Takes an io.Reader // A general-purpose scanner for UTF-8 encoded text. Takes an io.Reader
// providing the source which then can be tokenized through // providing the source which then can be tokenized through repeated
// repeated calls to the Scan function. // calls to the Scan function. For compatibility with existing tools,
// the NUL character is not allowed (implementation restriction).
// //
// By default, a Scanner skips white space and comments and // By default, a Scanner skips white space and comments and
// recognizes literals as defined by the Go language spec. // recognizes literals as defined by the Go language spec.
...@@ -245,13 +246,20 @@ func (s *Scanner) next() int { ...@@ -245,13 +246,20 @@ func (s *Scanner) next() int {
// uncommon case: not ASCII // uncommon case: not ASCII
var width int var width int
ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd]) ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
if ch == utf8.RuneError && width == 1 {
s.error("illegal UTF-8 encoding")
}
s.srcPos += width - 1 s.srcPos += width - 1
} }
} }
s.srcPos++ s.srcPos++
s.column++ s.column++
if ch == '\n' { switch ch {
case 0:
// implementation restriction for compatibility with other tools
s.error("illegal character NUL")
case '\n':
s.line++ s.line++
s.column = 0 s.column = 0
} }
......
...@@ -226,7 +226,7 @@ var tokenList = []token{ ...@@ -226,7 +226,7 @@ var tokenList = []token{
token{String, "`" + f100 + "`"}, token{String, "`" + f100 + "`"},
token{Comment, "// individual characters\n"}, token{Comment, "// individual characters\n"},
token{'\x00', "\x00"}, // NUL character is not allowed
token{'\x01', "\x01"}, token{'\x01', "\x01"},
token{' ' - 1, string(' ' - 1)}, token{' ' - 1, string(' ' - 1)},
token{'+', "+"}, token{'+', "+"},
...@@ -390,7 +390,8 @@ func TestScanNext(t *testing.T) { ...@@ -390,7 +390,8 @@ func TestScanNext(t *testing.T) {
func TestScanWhitespace(t *testing.T) { func TestScanWhitespace(t *testing.T) {
var buf bytes.Buffer var buf bytes.Buffer
var ws uint64 var ws uint64
for ch := byte(0); ch < ' '; ch++ { // start at 1, NUL character is not allowed
for ch := byte(1); ch < ' '; ch++ {
buf.WriteByte(ch) buf.WriteByte(ch)
ws |= 1 << ch ws |= 1 << ch
} }
...@@ -442,6 +443,8 @@ func TestError(t *testing.T) { ...@@ -442,6 +443,8 @@ func TestError(t *testing.T) {
testError(t, "`abc", "literal not terminated", String) testError(t, "`abc", "literal not terminated", String)
testError(t, `//`, "comment not terminated", EOF) testError(t, `//`, "comment not terminated", EOF)
testError(t, `/*/`, "comment not terminated", EOF) testError(t, `/*/`, "comment not terminated", EOF)
testError(t, `"abc`+"\x00"+`def"`, "illegal character NUL", String)
testError(t, `"abc`+"\xff"+`def"`, "illegal UTF-8 encoding", String)
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment