// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file implements source, a buffered rune reader
// which is specialized for the needs of the Go scanner:
// Contiguous sequences of runes (literals) are extracted
// directly as []byte without the need to re-encode the
// runes in UTF-8 (as would be necessary with bufio.Reader).
//
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.

14 15 16 17 18 19 20
package syntax

import (
	"io"
	"unicode/utf8"
)

21 22 23 24
// Starting values used when numbering source positions:
// init sets the current line and column to these values, and
// getr resets the column to colbase after each newline.
const (
	linebase = 1 // number of the first line
	colbase  = 1 // number of the first column in a line
)

// buf [...read...|...|...unread...|s|...free...]
//         ^      ^   ^            ^
//         |      |   |            |
//        suf     r0  r            w

// source is a buffered rune reader over an io.Reader.
//
// Bytes are read into the fixed-size buf; r0, r and w index into it as
// shown in the diagram above. The byte at buf[w] always holds a sentinel
// (utf8.RuneSelf), which is why w is documented as "excluding sentinel".
// While a literal is being scanned (suf >= 0), the bytes that have already
// been moved out of buf are accumulated in lit.
type source struct {
	src  io.Reader                        // underlying input
	errh func(line, pos uint, msg string) // error handler installed by init

	// source buffer
	buf         [4 << 10]byte
	offs        int   // source offset of buf
	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
	line0, line uint  // previous/current line
	col0, col   uint  // previous/current column (byte offsets from line start)
	ioerr       error // pending io error

	// literal buffer
	lit []byte // literal prefix
	suf int    // literal suffix; suf >= 0 means we are scanning a literal
}

47 48 49
// init initializes source to read from src and to report errors via errh.
// errh must not be nil.
func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
50 51 52 53 54 55
	s.src = src
	s.errh = errh

	s.buf[0] = utf8.RuneSelf // terminate with sentinel
	s.offs = 0
	s.r0, s.r, s.w = 0, 0, 0
56 57
	s.line0, s.line = 0, linebase
	s.col0, s.col = 0, colbase
58
	s.ioerr = nil
59 60 61 62 63

	s.lit = s.lit[:0]
	s.suf = -1
}

64 65 66
// ungetr ungets the most recently read rune.
func (s *source) ungetr() {
	s.r, s.line, s.col = s.r0, s.line0, s.col0
67 68
}

69 70 71 72 73 74 75 76
// ungetr2 is like ungetr but enables a 2nd ungetr.
// It must not be called if one of the runes seen
// was a newline.
func (s *source) ungetr2() {
	s.ungetr()
	// line must not have changed
	s.r0--
	s.col0--
77 78
}

79 80 81 82 83
// error reports msg through the installed error handler, using the
// saved (line0, col0) position as the location of the error.
func (s *source) error(msg string) {
	line, col := s.line0, s.col0
	s.errh(line, col, msg)
}

// getr reads and returns the next rune.
84 85 86 87 88 89 90
//
// If a read or source encoding error occurs, getr
// calls the error handler installed with init.
// The handler must exist.
//
// The (line, col) position passed to the error handler
// is always at the current source reading position.
91 92
func (s *source) getr() rune {
redo:
93
	s.r0, s.line0, s.col0 = s.r, s.line, s.col
94 95 96 97 98 99 100

	// We could avoid at least one test that is always taken in the
	// for loop below by duplicating the common case code (ASCII)
	// here since we always have at least the sentinel (utf8.RuneSelf)
	// in the buffer. Measure and optimize if necessary.

	// make sure we have at least one rune in buffer, or we are at EOF
101
	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
102 103 104 105 106 107 108
		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
	}

	// common case: ASCII and enough bytes
	// (invariant: s.buf[s.w] == utf8.RuneSelf)
	if b := s.buf[s.r]; b < utf8.RuneSelf {
		s.r++
109 110 111
		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
		// remember the line offset instead and then compute the offset as needed
		// (which is less often).
112
		s.col++
113 114 115 116 117 118
		if b == 0 {
			s.error("invalid NUL character")
			goto redo
		}
		if b == '\n' {
			s.line++
119
			s.col = colbase
120 121 122 123 124 125
		}
		return rune(b)
	}

	// EOF
	if s.r == s.w {
126 127
		if s.ioerr != io.EOF {
			s.error(s.ioerr.Error())
128 129 130 131 132 133 134
		}
		return -1
	}

	// uncommon case: not ASCII
	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
	s.r += w
135
	s.col += uint(w)
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183

	if r == utf8.RuneError && w == 1 {
		s.error("invalid UTF-8 encoding")
		goto redo
	}

	// BOM's are only allowed as the first character in a file
	const BOM = 0xfeff
	if r == BOM {
		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
			s.error("invalid BOM in the middle of the file")
		}
		goto redo
	}

	return r
}

func (s *source) fill() {
	// Slide unread bytes to beginning but preserve last read char
	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
	// call, only for ".." character sequence and float literals
	// starting with ".").
	if s.r0 > 1 {
		// save literal prefix, if any
		// (We see at most one ungetr call while reading
		// a literal, so make sure s.r0 remains in buf.)
		if s.suf >= 0 {
			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
			s.suf = 1 // == s.r0 after slide below
		}
		s.offs += s.r0 - 1
		r := s.r - s.r0 + 1 // last read char plus one byte
		s.w = r + copy(s.buf[r:], s.buf[s.r:s.w])
		s.r = r
		s.r0 = 1
	}

	// read more data: try a limited number of times
	for i := 100; i > 0; i-- {
		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
		if n < 0 {
			panic("negative read") // incorrect underlying io.Reader implementation
		}
		s.w += n
		if n > 0 || err != nil {
			s.buf[s.w] = utf8.RuneSelf // sentinel
			if err != nil {
184
				s.ioerr = err
185 186 187 188 189
			}
			return
		}
	}

190
	s.ioerr = io.ErrNoProgress
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
}

// startLit begins recording a literal at the saved read position r0.
// Any previously collected prefix is discarded; its storage is reused.
func (s *source) startLit() {
	s.lit, s.suf = s.lit[:0], s.r0
}

// stopLit ends the current literal and returns its bytes: the saved
// prefix in lit, if any, followed by buf[suf:r]. Without a saved prefix
// the result is a slice of buf itself. Afterwards no literal is pending.
func (s *source) stopLit() []byte {
	tail := s.buf[s.suf:s.r]
	s.suf = -1 // no pending literal
	if len(s.lit) == 0 {
		return tail
	}
	return append(s.lit, tail...)
}