Commit c2de390d authored by Stefan Behnel

fix universal newline parsing (which the codecs module doesn't support), use the fast 'io' module for file reading in Py2.6 and later
parent 76c5ac96
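
A minimal, self-contained illustration (not part of the commit) of the io behaviour the change relies on: io.open() with the default newline handling translates '\r' and '\r\n' to '\n' while decoding, which is the universal-newline support the commit message says the codecs module lacks.

    import io
    import os
    import tempfile

    # Write a file that mixes '\r\n', '\r' and '\n' line endings.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    try:
        with open(path, 'wb') as f:
            f.write(u'one\r\ntwo\rthree\n'.encode('UTF-8'))
        # io.open() with newline=None (the default) maps all three
        # endings to '\n' while decoding ("universal newlines").
        f = io.open(path, 'r', encoding='UTF-8')
        try:
            assert f.read() == u'one\ntwo\nthree\n'
        finally:
            f.close()
    finally:
        os.remove(path)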
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
         self._cmp_name = filename
 
     def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # newline normalisation is costly before Py2.6
+            require_normalised_newlines=False)
 
     def get_description(self):
         return self.filename
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
 
 def detect_file_encoding(source_filename):
     # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
     try:
         chars = []
         for i in range(2):
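
For context, detect_file_encoding() looks for a PEP 263 coding declaration using the _match_file_encoding regex visible in the hunk header above. A small sketch of what that pattern picks up (illustration only, not part of the commit):

    import re

    # Same pattern as in the hunk header, written as a raw string.
    _match_file_encoding = re.compile(r"coding[:=]\s*([-\w.]+)").search

    m = _match_file_encoding(u"# -*- coding: latin-1 -*-")
    assert m is not None and m.group(1) == u"latin-1"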
@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
         f.close()
     return "UTF-8"
 
-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+normalise_newlines = re.compile(u'\r\n?|\n').sub
+
+class NormalisedNewlineStream(object):
+    """The codecs module doesn't provide universal newline support.
+    This class is used as a stream wrapper that provides this
+    functionality.  The new 'io' in Py2.6+/3.1+ supports this out of the
+    box.
+    """
+    def __init__(self, stream):
+        # let's assume .read() doesn't change
+        self._read = stream.read
+        self.close = stream.close
+        self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+    def read(self, count):
+        data = self._read(count)
+        if u'\r' not in data:
+            return data
+        if data.endswith(u'\r'):
+            # may be missing a '\n'
+            data += self._read(1)
+        return normalise_newlines(u'\n', data)
+
+    def readlines(self):
+        content = []
+        data = self._read(0x1000)
+        while data:
+            content.append(data)
+            data = self._read(0x1000)
+        return u''.join(content).split(u'\n')
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    if encoding is None:
+        encoding = detect_file_encoding(source_filename)
+    if io_open is not None:
+        return io_open(source_filename, mode=mode,
+                       encoding=encoding, errors=error_handling)
+    else:
+        # codecs module doesn't have universal newline support
+        stream = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if require_normalised_newlines:
+            stream = NormalisedNewlineStream(stream)
+        return stream
 
 def long_literal(value):
     if isinstance(value, basestring):
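
A quick usage sketch of the fallback wrapper (assumes the NormalisedNewlineStream class and normalise_newlines helper from the hunk above are in scope; io.StringIO stands in here for the codecs stream that open_source_file() would wrap on a pre-2.6 Python):

    import io

    raw = io.StringIO(u'first\r\nsecond\rthird\n')
    wrapped = NormalisedNewlineStream(raw)

    # read() rewrites '\r\n' and bare '\r' to '\n' on the fly.
    assert wrapped.read(64) == u'first\nsecond\nthird\n'

    # StringIO has no .encoding attribute, so the 'UTF-8' fallback applies.
    assert wrapped.encoding == 'UTF-8'
    wrapped.close()

On Py2.6+/3.1+, open_source_file() takes the io.open() branch instead, so no wrapper is needed there.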