Commit 4856376e authored by Stefan Behnel

merged in latest cython-devel

parents 02fa6935 c2de390d
......@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
self._cmp_name = filename
def get_lines(self, encoding=None, error_handling=None):
if not encoding:
return Utils.open_source_file(self.filename)
else:
return codecs.open(self.filename, "rU", encoding=encoding,
errors=error_handling)
return Utils.open_source_file(
self.filename, encoding=encoding,
error_handling=error_handling,
# newline normalisation is costly before Py2.6
require_normalised_newlines=False)
def get_description(self):
return self.filename
......
......@@ -7,11 +7,12 @@
#
#=======================================================================
import cython
cython.declare(BOL=object, EOL=object, EOF=object)
import Errors
from Regexps import BOL, EOL, EOF
import cython
class Scanner(object):
"""
A Scanner is used to read tokens from a stream of characters
......@@ -77,7 +78,7 @@ class Scanner(object):
"""
self.trace = 0
self.buffer = ''
self.buffer = u''
self.buf_start_pos = 0
self.next_pos = 0
self.cur_pos = 0
......@@ -137,15 +138,15 @@ class Scanner(object):
if self.trace:
print("Scanner: read: Performing %s %d:%d" % (
action, self.start_pos, self.cur_pos))
base = self.buf_start_pos
text = self.buffer[self.start_pos - base : self.cur_pos - base]
text = self.buffer[self.start_pos - self.buf_start_pos :
self.cur_pos - self.buf_start_pos]
return (text, action)
else:
if self.cur_pos == self.start_pos:
if self.cur_char is EOL:
self.next_char()
if self.cur_char is None or self.cur_char is EOF:
return ('', None)
return (u'', None)
raise Errors.UnrecognizedInput(self, self.state_name)
def run_machine_inlined(self):
......@@ -205,9 +206,9 @@ class Scanner(object):
c = buffer[buf_index]
next_pos = next_pos + 1
else:
c = ''
c = u''
# End inlined: c = self.read_char()
if c == '\n':
if c == u'\n':
cur_char = EOL
input_state = 2
elif not c:
......@@ -216,7 +217,7 @@ class Scanner(object):
else:
cur_char = c
elif input_state == 2:
cur_char = '\n'
cur_char = u'\n'
input_state = 3
elif input_state == 3:
cur_line = cur_line + 1
......@@ -227,7 +228,7 @@ class Scanner(object):
cur_char = EOF
input_state = 5
else: # input_state = 5
cur_char = ''
cur_char = u''
# End inlined self.next_char()
else: # not new_state
if trace: #TRACE#
......@@ -258,7 +259,7 @@ class Scanner(object):
if input_state == 1:
self.cur_pos = self.next_pos
c = self.read_char()
if c == '\n':
if c == u'\n':
self.cur_char = EOL
self.input_state = 2
elif not c:
......@@ -267,7 +268,7 @@ class Scanner(object):
else:
self.cur_char = c
elif input_state == 2:
self.cur_char = '\n'
self.cur_char = u'\n'
self.input_state = 3
elif input_state == 3:
self.cur_line = self.cur_line + 1
......@@ -278,7 +279,7 @@ class Scanner(object):
self.cur_char = EOF
self.input_state = 5
else: # input_state = 5
self.cur_char = ''
self.cur_char = u''
if self.trace:
print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))
......
......@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
def detect_file_encoding(source_filename):
# PEPs 263 and 3120
f = codecs.open(source_filename, "rU", encoding="UTF-8")
f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
try:
chars = []
for i in range(2):
......@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
f.close()
return "UTF-8"
def open_source_file(source_filename, mode="rU"):
encoding = detect_file_encoding(source_filename)
return codecs.open(source_filename, mode=mode, encoding=encoding)
# Substitution function that matches any line ending: CRLF, lone CR or LF.
normalise_newlines = re.compile(u'\r\n?|\n').sub


class NormalisedNewlineStream(object):
    """Stream wrapper that adds universal newline support.

    The codecs module doesn't translate line endings, so this wrapper
    rewrites CRLF and lone CR sequences to LF in everything it hands out.
    The new 'io' module in Py2.6+/3.1+ supports this out of the box.
    """

    def __init__(self, stream):
        """Wrap *stream*, which must provide ``read(count)`` and ``close()``."""
        # let's assume .read() doesn't change
        self._read = stream.read
        self.close = stream.close
        self.encoding = getattr(stream, 'encoding', 'UTF-8')

    def read(self, count=-1):
        """Read from the wrapped stream and return newline-normalised text.

        *count* is passed through to the underlying ``read()``.  The result
        may be slightly longer than requested: when the chunk ends in a CR,
        further characters are pulled in so that a CRLF pair is never split
        across two calls.
        """
        data = self._read(count)
        if u'\r' not in data:
            return data
        # A trailing CR may be the first half of a CRLF pair.  Keep reading
        # single characters until the chunk no longer ends in CR (the
        # look-ahead itself may be another CR) or the stream is exhausted.
        while data.endswith(u'\r'):
            extra = self._read(1)
            if not extra:
                break
            data += extra
        return normalise_newlines(u'\n', data)

    def readlines(self):
        """Return the remaining content as a list of lines.

        The content is split on LF after normalisation, so the lines carry
        no terminator and a final newline yields a trailing empty string.
        """
        content = []
        # Go through self.read() (not the raw stream) so that CR and CRLF
        # line endings are normalised before splitting.
        data = self.read(0x1000)
        while data:
            content.append(data)
            data = self.read(0x1000)
        return u''.join(content).split(u'\n')
try:
from io import open as io_open
except ImportError:
io_open = None
def open_source_file(source_filename, mode="r",
                     encoding=None, error_handling=None,
                     require_normalised_newlines=True):
    """Open a source file for reading decoded text.

    When *encoding* is None it is determined with detect_file_encoding().
    *error_handling* is forwarded as the ``errors`` argument of the opener.
    If the 'io' module is available its open() is used; otherwise the file
    is opened through codecs.open() and, if *require_normalised_newlines*
    is true, wrapped in a NormalisedNewlineStream.
    """
    if encoding is None:
        encoding = detect_file_encoding(source_filename)

    if io_open is None:
        # codecs module doesn't have universal newline support
        reader = codecs.open(source_filename, mode=mode,
                             encoding=encoding, errors=error_handling)
        if not require_normalised_newlines:
            return reader
        return NormalisedNewlineStream(reader)

    return io_open(source_filename, mode=mode,
                   encoding=encoding, errors=error_handling)
def str_to_number(value):
# note: this expects a string as input that was accepted by the
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment