Commit 11740e19 authored by Robert Griesemer

go/scanner: minimal non-terminated literals

Consume as little input as possible when encountering
non-terminated rune, string, and raw string literals.
The old code consumed at least one extra character,
which could lead to worse error recovery when parsing
erroneous sources.

Also made error messages in those cases more consistent.

Fixes #7091.

R=adonovan
CC=golang-codereviews
https://golang.org/cl/50630043
parent a2edc469
...@@ -402,29 +402,30 @@ func (s *Scanner) scanEscape(quote rune) { ...@@ -402,29 +402,30 @@ func (s *Scanner) scanEscape(quote rune) {
} }
} }
func (s *Scanner) scanChar() string { func (s *Scanner) scanRune() string {
// '\'' opening already consumed // '\'' opening already consumed
offs := s.offset - 1 offs := s.offset - 1
n := 0 n := 0
for s.ch != '\'' { for {
ch := s.ch ch := s.ch
n++
s.next()
if ch == '\n' || ch < 0 { if ch == '\n' || ch < 0 {
s.error(offs, "character literal not terminated") s.error(offs, "rune literal not terminated")
n = 1 n = 1 // avoid further errors
break
}
s.next()
if ch == '\'' {
break break
} }
n++
if ch == '\\' { if ch == '\\' {
s.scanEscape('\'') s.scanEscape('\'')
} }
} }
s.next()
if n != 1 { if n != 1 {
s.error(offs, "illegal character literal") s.error(offs, "illegal rune literal")
} }
return string(s.src[offs:s.offset]) return string(s.src[offs:s.offset])
...@@ -434,11 +435,14 @@ func (s *Scanner) scanString() string { ...@@ -434,11 +435,14 @@ func (s *Scanner) scanString() string {
// '"' opening already consumed // '"' opening already consumed
offs := s.offset - 1 offs := s.offset - 1
for s.ch != '"' { for {
ch := s.ch ch := s.ch
s.next()
if ch == '\n' || ch < 0 { if ch == '\n' || ch < 0 {
s.error(offs, "string not terminated") s.error(offs, "string literal not terminated")
break
}
s.next()
if ch == '"' {
break break
} }
if ch == '\\' { if ch == '\\' {
...@@ -446,8 +450,6 @@ func (s *Scanner) scanString() string { ...@@ -446,8 +450,6 @@ func (s *Scanner) scanString() string {
} }
} }
s.next()
return string(s.src[offs:s.offset]) return string(s.src[offs:s.offset])
} }
...@@ -468,20 +470,21 @@ func (s *Scanner) scanRawString() string { ...@@ -468,20 +470,21 @@ func (s *Scanner) scanRawString() string {
offs := s.offset - 1 offs := s.offset - 1
hasCR := false hasCR := false
for s.ch != '`' { for {
ch := s.ch ch := s.ch
if ch < 0 {
s.error(offs, "raw string literal not terminated")
break
}
s.next() s.next()
if ch == '`' {
break
}
if ch == '\r' { if ch == '\r' {
hasCR = true hasCR = true
} }
if ch < 0 {
s.error(offs, "string not terminated")
break
}
} }
s.next()
lit := s.src[offs:s.offset] lit := s.src[offs:s.offset]
if hasCR { if hasCR {
lit = stripCR(lit) lit = stripCR(lit)
...@@ -617,7 +620,7 @@ scanAgain: ...@@ -617,7 +620,7 @@ scanAgain:
case '\'': case '\'':
insertSemi = true insertSemi = true
tok = token.CHAR tok = token.CHAR
lit = s.scanChar() lit = s.scanRune()
case '`': case '`':
insertSemi = true insertSemi = true
tok = token.STRING tok = token.STRING
......
...@@ -631,7 +631,7 @@ type errorCollector struct { ...@@ -631,7 +631,7 @@ type errorCollector struct {
pos token.Position // last error position encountered pos token.Position // last error position encountered
} }
func checkError(t *testing.T, src string, tok token.Token, pos int, err string) { func checkError(t *testing.T, src string, tok token.Token, pos int, lit, err string) {
var s Scanner var s Scanner
var h errorCollector var h errorCollector
eh := func(pos token.Position, msg string) { eh := func(pos token.Position, msg string) {
...@@ -640,7 +640,7 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string) ...@@ -640,7 +640,7 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string)
h.pos = pos h.pos = pos
} }
s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, ScanComments|dontInsertSemis) s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, ScanComments|dontInsertSemis)
_, tok0, _ := s.Scan() _, tok0, lit0 := s.Scan()
_, tok1, _ := s.Scan() _, tok1, _ := s.Scan()
if tok0 != tok { if tok0 != tok {
t.Errorf("%q: got %s, expected %s", src, tok0, tok) t.Errorf("%q: got %s, expected %s", src, tok0, tok)
...@@ -648,6 +648,9 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string) ...@@ -648,6 +648,9 @@ func checkError(t *testing.T, src string, tok token.Token, pos int, err string)
if tok1 != token.EOF { if tok1 != token.EOF {
t.Errorf("%q: got %s, expected EOF", src, tok1) t.Errorf("%q: got %s, expected EOF", src, tok1)
} }
if tok0 != token.ILLEGAL && lit0 != lit {
t.Errorf("%q: got literal %q, expected %q", src, lit0, lit)
}
cnt := 0 cnt := 0
if err != "" { if err != "" {
cnt = 1 cnt = 1
...@@ -667,43 +670,49 @@ var errors = []struct { ...@@ -667,43 +670,49 @@ var errors = []struct {
src string src string
tok token.Token tok token.Token
pos int pos int
lit string
err string err string
}{ }{
{"\a", token.ILLEGAL, 0, "illegal character U+0007"}, {"\a", token.ILLEGAL, 0, "", "illegal character U+0007"},
{`#`, token.ILLEGAL, 0, "illegal character U+0023 '#'"}, {`#`, token.ILLEGAL, 0, "", "illegal character U+0023 '#'"},
{`…`, token.ILLEGAL, 0, "illegal character U+2026 '…'"}, {`…`, token.ILLEGAL, 0, "", "illegal character U+2026 '…'"},
{`' '`, token.CHAR, 0, ""}, {`' '`, token.CHAR, 0, `' '`, ""},
{`''`, token.CHAR, 0, "illegal character literal"}, {`''`, token.CHAR, 0, `''`, "illegal rune literal"},
{`'\8'`, token.CHAR, 2, "unknown escape sequence"}, {`'123'`, token.CHAR, 0, `'123'`, "illegal rune literal"},
{`'\08'`, token.CHAR, 3, "illegal character in escape sequence"}, {`'\8'`, token.CHAR, 2, `'\8'`, "unknown escape sequence"},
{`'\x0g'`, token.CHAR, 4, "illegal character in escape sequence"}, {`'\08'`, token.CHAR, 3, `'\08'`, "illegal character in escape sequence"},
{`'\Uffffffff'`, token.CHAR, 2, "escape sequence is invalid Unicode code point"}, {`'\x0g'`, token.CHAR, 4, `'\x0g'`, "illegal character in escape sequence"},
{`'`, token.CHAR, 0, "character literal not terminated"}, {`'\Uffffffff'`, token.CHAR, 2, `'\Uffffffff'`, "escape sequence is invalid Unicode code point"},
{`""`, token.STRING, 0, ""}, {`'`, token.CHAR, 0, `'`, "rune literal not terminated"},
{`"`, token.STRING, 0, "string not terminated"}, {"'\n", token.CHAR, 0, "'", "rune literal not terminated"},
{"``", token.STRING, 0, ""}, {"'\n ", token.CHAR, 0, "'", "rune literal not terminated"},
{"`", token.STRING, 0, "string not terminated"}, {`""`, token.STRING, 0, `""`, ""},
{"/**/", token.COMMENT, 0, ""}, {`"abc`, token.STRING, 0, `"abc`, "string literal not terminated"},
{"/*", token.COMMENT, 0, "comment not terminated"}, {"\"abc\n", token.STRING, 0, `"abc`, "string literal not terminated"},
{"077", token.INT, 0, ""}, {"\"abc\n ", token.STRING, 0, `"abc`, "string literal not terminated"},
{"078.", token.FLOAT, 0, ""}, {"``", token.STRING, 0, "``", ""},
{"07801234567.", token.FLOAT, 0, ""}, {"`", token.STRING, 0, "`", "raw string literal not terminated"},
{"078e0", token.FLOAT, 0, ""}, {"/**/", token.COMMENT, 0, "/**/", ""},
{"078", token.INT, 0, "illegal octal number"}, {"/*", token.COMMENT, 0, "/*", "comment not terminated"},
{"07800000009", token.INT, 0, "illegal octal number"}, {"077", token.INT, 0, "077", ""},
{"0x", token.INT, 0, "illegal hexadecimal number"}, {"078.", token.FLOAT, 0, "078.", ""},
{"0X", token.INT, 0, "illegal hexadecimal number"}, {"07801234567.", token.FLOAT, 0, "07801234567.", ""},
{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"}, {"078e0", token.FLOAT, 0, "078e0", ""},
{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"}, {"078", token.INT, 0, "078", "illegal octal number"},
{"\ufeff\ufeff", token.ILLEGAL, 3, "illegal byte order mark"}, // only first BOM is ignored {"07800000009", token.INT, 0, "07800000009", "illegal octal number"},
{"//\ufeff", token.COMMENT, 2, "illegal byte order mark"}, // only first BOM is ignored {"0x", token.INT, 0, "0x", "illegal hexadecimal number"},
{"'\ufeff" + `'`, token.CHAR, 1, "illegal byte order mark"}, // only first BOM is ignored {"0X", token.INT, 0, "0X", "illegal hexadecimal number"},
{`"` + "abc\ufeffdef" + `"`, token.STRING, 4, "illegal byte order mark"}, // only first BOM is ignored {"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"},
{"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"},
{"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"}, // only first BOM is ignored
{"//\ufeff", token.COMMENT, 2, "//\ufeff", "illegal byte order mark"}, // only first BOM is ignored
{"'\ufeff" + `'`, token.CHAR, 1, "'\ufeff" + `'`, "illegal byte order mark"}, // only first BOM is ignored
{`"` + "abc\ufeffdef" + `"`, token.STRING, 4, `"` + "abc\ufeffdef" + `"`, "illegal byte order mark"}, // only first BOM is ignored
} }
func TestScanErrors(t *testing.T) { func TestScanErrors(t *testing.T) {
for _, e := range errors { for _, e := range errors {
checkError(t, e.src, e.tok, e.pos, e.err) checkError(t, e.src, e.tok, e.pos, e.lit, e.err)
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment