Commit 743df03d authored by Stefan Behnel

Simplify encoding detection and source file opening by starting off reading...

Simplify encoding detection and source file opening by starting off reading bytes, and only switching to decoding once we know which encoding to use.
parent 04038f87
...@@ -10,6 +10,11 @@ try: ...@@ -10,6 +10,11 @@ try:
except ImportError: except ImportError:
basestring = str basestring = str
try:
FileNotFoundError
except NameError:
FileNotFoundError = OSError
import os import os
import sys import sys
import re import re
...@@ -233,43 +238,28 @@ def decode_filename(filename): ...@@ -233,43 +238,28 @@ def decode_filename(filename):
# support for source file encoding detection # support for source file encoding detection
_match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search _match_file_encoding = re.compile(b"coding[:=]\s*([-\w.]+)").search
def detect_file_encoding(source_filename):
f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
try:
return detect_opened_file_encoding(f)
finally:
f.close()
def detect_opened_file_encoding(f): def detect_opened_file_encoding(f):
# PEPs 263 and 3120 # PEPs 263 and 3120
# Most of the time the first two lines fall in the first 250 chars, # Most of the time the first two lines fall in the first couple of hundred chars,
# and this bulk read/split is much faster. # and this bulk read/split is much faster.
lines = f.read(250).split(u"\n") lines = ()
if len(lines) > 1: start = b''
while len(lines) < 3:
data = f.read(500)
start += data
lines = start.split(b"\n")
if not data:
break
m = _match_file_encoding(lines[0]) m = _match_file_encoding(lines[0])
if m: if m:
return m.group(1) return m.group(1).decode('iso8859-1')
elif len(lines) > 2: elif len(lines) > 1:
m = _match_file_encoding(lines[1]) m = _match_file_encoding(lines[1])
if m: if m:
return m.group(1) return m.group(1).decode('iso8859-1')
else:
return "UTF-8"
# Fallback to one-char-at-a-time detection.
f.seek(0)
chars = []
for i in range(2):
c = f.read(1)
while c and c != u'\n':
chars.append(c)
c = f.read(1)
encoding = _match_file_encoding(u''.join(chars))
if encoding:
return encoding.group(1)
return "UTF-8" return "UTF-8"
...@@ -283,32 +273,33 @@ def skip_bom(f): ...@@ -283,32 +273,33 @@ def skip_bom(f):
f.seek(0) f.seek(0)
def open_source_file(source_filename, mode="r", def open_source_file(source_filename, encoding=None, error_handling=None):
encoding=None, error_handling=None): stream = None
try:
if encoding is None: if encoding is None:
# Most of the time the coding is unspecified, so be optimistic that # Most of the time the encoding is not specified, so try hard to open the file only once.
# it's UTF-8. f = io.open(source_filename, 'rb')
f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore')
encoding = detect_opened_file_encoding(f) encoding = detect_opened_file_encoding(f)
if encoding == "UTF-8" and error_handling == 'ignore':
f.seek(0) f.seek(0)
skip_bom(f) stream = io.TextIOWrapper(f, encoding=encoding, errors=error_handling)
return f
else: else:
f.close() stream = io.open(source_filename, encoding=encoding, errors=error_handling)
if not os.path.exists(source_filename): except OSError:
if os.path.exists(source_filename):
raise # File is there, but something went wrong reading from it.
# Allow source files to be in zip files etc.
try: try:
loader = __loader__ loader = __loader__
if source_filename.startswith(loader.archive): if source_filename.startswith(loader.archive):
return open_source_from_loader( stream = open_source_from_loader(
loader, source_filename, loader, source_filename,
encoding, error_handling) encoding, error_handling)
except (NameError, AttributeError): except (NameError, AttributeError):
pass pass
stream = io.open(source_filename, mode=mode, if stream is None:
encoding=encoding, errors=error_handling) raise FileNotFoundError(source_filename)
skip_bom(stream) skip_bom(stream)
return stream return stream
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment