ignore UTF-8 BOMs at the beginning of source files

13bbc206 · Stefan Behnel · fe78837b · 13bbc206 · 13bbc206 · 13bbc206
Commit 13bbc206 authored Sep 30, 2013 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 36 additions and 6 deletions

Cython/Utils.py +21 -4

runtests.py +7 -2

tests/compile/utf8bom.pyx +8 -0

No files found.
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -215,6 +215,17 @@ def detect_opened_file_encoding(f):
                return encoding.group(1)
    return "UTF-8"

+
+def skip_bom(f):
+    """
+    Read past a BOM at the beginning of a source file.
+    This could be added to the scanner, but it's *substantially* easier
+    to keep it at this level.
+    """
+    if f.read(1) != u'\uFEFF':
+        f.seek(0)
+
+
 normalise_newlines = re.compile(u'\r\n?|\n').sub


@@ -264,6 +275,7 @@ if sys.version_info >= (2,6):
    except ImportError:
        pass

+
 def open_source_file(source_filename, mode="r",
                     encoding=None, error_handling=None,
                     require_normalised_newlines=True):
@@ -272,8 +284,11 @@ def open_source_file(source_filename, mode="r",
        # it's UTF-8.
        f = open_source_file(source_filename, encoding="UTF-8", mode=mode, error_handling='ignore')
        encoding = detect_opened_file_encoding(f)
-        if encoding == "UTF-8" and error_handling=='ignore' and require_normalised_newlines:
+        if (encoding == "UTF-8"
+                and error_handling == 'ignore'
+                and require_normalised_newlines):
            f.seek(0)
+            skip_bom(f)
            return f
        else:
            f.close()
@@ -290,15 +305,17 @@ def open_source_file(source_filename, mode="r",
            pass
    #
    if io is not None:
-        return io.open(source_filename, mode=mode,
-                       encoding=encoding, errors=error_handling)
+        stream = io.open(source_filename, mode=mode,
+                         encoding=encoding, errors=error_handling)
    else:
        # codecs module doesn't have universal newline support
        stream = codecs.open(source_filename, mode=mode,
                             encoding=encoding, errors=error_handling)
        if require_normalised_newlines:
            stream = NormalisedNewlineStream(stream)
-        return stream
+    skip_bom(stream)
+    return stream
+

 def open_source_from_loader(loader,
                            source_filename,

--- a/runtests.py
+++ b/runtests.py
@@ -277,6 +277,9 @@ TEST_SUPPORT_DIR = 'testsupport'

 BACKENDS = ['c', 'cpp']

+UTF8_BOM_BYTES = r'\xef\xbb\xbf'.encode('ISO-8859-1').decode('unicode_escape')
+
+
 def memoize(f):
    uncomputed = object()
    f._cache = {}
@@ -287,13 +290,15 @@ def memoize(f):
        return res
    return func

+
 @memoize
 def parse_tags(filepath):
    tags = defaultdict(list)
-    f = io_open(filepath, encoding='ISO-8859-1', errors='replace')
+    f = io_open(filepath, encoding='ISO-8859-1', errors='ignore')
    try:
        for line in f:
-            line = line.strip()
+            # ignore BOM-like bytes and whitespace
+            line = line.lstrip(UTF8_BOM_BYTES).strip()
            if not line:
                continue
            if line[0] != '#':

--- a/tests/compile/utf8bom.pyx
+++ b/tests/compile/utf8bom.pyx
+# coding: utf-8
+# mode: compile
+
+# this file starts with a UTF-8 encoded BOM
+# the only thing we test is that it properly compiles
+
+def test():
+    pass