Commit 13bbc206 authored by Stefan Behnel

ignore UTF-8 BOMs at the beginning of source files

parent fe78837b
......@@ -215,6 +215,17 @@ def detect_opened_file_encoding(f):
return encoding.group(1)
return "UTF-8"
def skip_bom(f):
    """
    Read past a BOM at the beginning of a source file.

    One character is read from ``f``.  If it is not U+FEFF, the stream is
    rewound to offset 0 so that no source text is lost; otherwise the
    stream is left positioned just after the BOM.  ``f`` is therefore
    expected to yield decoded text, to support ``seek()``, and to be
    positioned at the start of the file when this is called.

    This could be added to the scanner, but it's *substantially* easier
    to keep it at this level.
    """
    if f.read(1) != u'\uFEFF':
        # Not a BOM (this also covers an empty file, where read() returns
        # an empty string): rewind so the caller sees the first character.
        f.seek(0)
normalise_newlines = re.compile(u'\r\n?|\n').sub
......@@ -264,6 +275,7 @@ if sys.version_info >= (2,6):
except ImportError:
pass
def open_source_file(source_filename, mode="r",
encoding=None, error_handling=None,
require_normalised_newlines=True):
......@@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r",
# it's UTF-8.
f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore')
encoding = detect_opened_file_encoding(f)
if encoding == "UTF-8" and error_handling=='ignore' and require_normalised_newlines:
if (encoding == "UTF-8"
and error_handling == 'ignore'
and require_normalised_newlines):
f.seek(0)
skip_bom(f)
return f
else:
f.close()
......@@ -290,15 +305,17 @@ def open_source_file(source_filename, mode="r",
pass
#
if io is not None:
return io.open(source_filename, mode=mode,
encoding=encoding, errors=error_handling)
stream = io.open(source_filename, mode=mode,
encoding=encoding, errors=error_handling)
else:
# codecs module doesn't have universal newline support
stream = codecs.open(source_filename, mode=mode,
encoding=encoding, errors=error_handling)
if require_normalised_newlines:
stream = NormalisedNewlineStream(stream)
return stream
skip_bom(stream)
return stream
def open_source_from_loader(loader,
source_filename,
......
......@@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport'
BACKENDS = ['c', 'cpp']
# The three UTF-8 BOM bytes (EF BB BF) as the characters they turn into when
# a file is read as ISO-8859-1, which is how parse_tags() opens test files.
# NOTE(review): the encode/decode round trip presumably exists to obtain a
# unicode string on both Python 2 and 3 without a u'' literal -- confirm.
UTF8_BOM_BYTES = r'\xef\xbb\xbf'.encode('ISO-8859-1').decode('unicode_escape')
def memoize(f):
uncomputed = object()
f._cache = {}
......@@ -287,13 +290,15 @@ def memoize(f):
return res
return func
@memoize
def parse_tags(filepath):
tags = defaultdict(list)
f = io_open(filepath, encoding='ISO-8859-1', errors='replace')
f = io_open(filepath, encoding='ISO-8859-1', errors='ignore')
try:
for line in f:
line = line.strip()
# ignore BOM-like bytes and whitespace
line = line.lstrip(UTF8_BOM_BYTES).strip()
if not line:
continue
if line[0] != '#':
......
# coding: utf-8
# mode: compile
# this file starts with a UTF-8 encoded BOM
# the only thing we test is that it properly compiles
def test():
    """Do nothing; this file only has to compile despite its leading BOM."""
    pass
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment