merged in latest cython-devel

4856376e · Stefan Behnel · 02fa6935 · c2de390d · 4856376e · 4856376e
Commit 4856376e authored Mar 20, 2010 by Stefan Behnel
4 changed files
--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
        self._cmp_name = filename
    
    def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        """Open self.filename through Utils.open_source_file() and
+        return whatever that call returns.
+
+        'encoding' and 'error_handling' are passed through unchanged.
+        """
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # newline normalisation is costly before Py2.6
+            require_normalised_newlines=False)
    
    def get_description(self):
        return self.filename

--- a/Cython/Plex/Scanners.py
+++ b/Cython/Plex/Scanners.py
@@ -7,11 +7,12 @@
 #
 #=======================================================================

+import cython
+cython.declare(BOL=object, EOL=object, EOF=object)
+
 import Errors
 from Regexps import BOL, EOL, EOF

-import cython
-
 class Scanner(object):
  """
  A Scanner is used to read tokens from a stream of characters
@@ -77,7 +78,7 @@ class Scanner(object):
    """
    self.trace = 0

-    self.buffer = ''
+    self.buffer = u''
    self.buf_start_pos = 0
    self.next_pos = 0
    self.cur_pos = 0
@@ -137,15 +138,15 @@ class Scanner(object):
      if self.trace:
        print("Scanner: read: Performing %s %d:%d" % (
          action, self.start_pos, self.cur_pos))
-      base = self.buf_start_pos
-      text = self.buffer[self.start_pos - base : self.cur_pos - base]
+      text = self.buffer[self.start_pos - self.buf_start_pos :
+                         self.cur_pos   - self.buf_start_pos]
      return (text, action)
    else:
      if self.cur_pos == self.start_pos:
        if self.cur_char is EOL:
          self.next_char()
        if self.cur_char is None or self.cur_char is EOF:
-          return ('', None)
+          return (u'', None)
      raise Errors.UnrecognizedInput(self, self.state_name)

  def run_machine_inlined(self):
@@ -205,9 +206,9 @@ class Scanner(object):
              c = buffer[buf_index]
              next_pos = next_pos + 1
            else:
-              c = ''
+              c = u''
          # End inlined: c = self.read_char()
-          if c == '\n':
+          if c == u'\n':
            cur_char = EOL
            input_state = 2
          elif not c:
@@ -216,7 +217,7 @@ class Scanner(object):
          else:
            cur_char = c
        elif input_state == 2:
-          cur_char = '\n'
+          cur_char = u'\n'
          input_state = 3
        elif input_state == 3:
          cur_line = cur_line + 1
@@ -227,7 +228,7 @@ class Scanner(object):
          cur_char = EOF
          input_state = 5
        else: # input_state = 5
-          cur_char = ''
+          cur_char = u''
        # End inlined self.next_char()
      else: # not new_state
        if trace: #TRACE#
@@ -258,7 +259,7 @@ class Scanner(object):
    if input_state == 1:
      self.cur_pos = self.next_pos
      c = self.read_char()
-      if c == '\n':
+      if c == u'\n':
        self.cur_char = EOL
        self.input_state = 2
      elif not c:
@@ -267,7 +268,7 @@ class Scanner(object):
      else:
        self.cur_char = c
    elif input_state == 2:
-      self.cur_char = '\n'
+      self.cur_char = u'\n'
      self.input_state = 3
    elif input_state == 3:
      self.cur_line = self.cur_line + 1
@@ -278,7 +279,7 @@ class Scanner(object):
      self.cur_char = EOF
      self.input_state = 5
    else: # input_state = 5
-      self.cur_char = ''
+      self.cur_char = u''
    if self.trace:
      print("--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)))


--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search

 def detect_file_encoding(source_filename):
    # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
    try:
        chars = []
        for i in range(2):
@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
        f.close()
    return "UTF-8"

-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+# Replaces every newline variant (CR LF, lone CR, LF) by the given text.
+normalise_newlines = re.compile(u'\r\n?|\n').sub
+
+class NormalisedNewlineStream(object):
+  """The codecs module doesn't provide universal newline support.
+  This class is used as a stream wrapper that provides this
+  functionality.  The new 'io' in Py2.6+/3.1+ supports this out of the
+  box.
+
+  Only read(), readlines(), close() and the 'encoding' attribute are
+  provided.  CR LF pairs and lone CR characters in the text returned
+  by the wrapped stream are each replaced by a single LF.
+  """
+  def __init__(self, stream):
+    # Bind the wrapped stream's methods once; this assumes that
+    # stream.read is not replaced after the wrapper was created.
+    self._read = stream.read
+    self.close = stream.close
+    self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+  def read(self, count=-1):
+    """Read 'count' characters from the wrapped stream and return them
+    with normalised newlines.
+
+    The result can be longer than 'count' (look-ahead after a trailing
+    CR) or shorter (each CR LF pair collapses into one character).
+    """
+    data = self._read(count)
+    if u'\r' not in data:
+      return data
+    # A trailing CR may be the first half of a CR LF pair that was cut
+    # by the read boundary.  Keep pulling single characters until the
+    # chunk no longer ends in CR or the stream is exhausted; otherwise
+    # the LF would be counted as a second newline by the next read().
+    while data.endswith(u'\r'):
+      more = self._read(1)
+      if not more:
+        break
+      data += more
+    return normalise_newlines(u'\n', data)
+
+  def readlines(self):
+    """Read the rest of the stream and return it split at newlines.
+
+    Line terminators are not part of the returned strings; content
+    that ends with a newline yields a trailing empty string.
+    """
+    content = []
+    # Go through self.read() rather than the raw stream so that the
+    # newlines are normalised before splitting.
+    data = self.read(0x1000)
+    while data:
+        content.append(data)
+        data = self.read(0x1000)
+    return u''.join(content).split(u'\n')
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    """Open 'source_filename' as a decoding stream and return it.
+
+    If 'encoding' is None, it is looked up with detect_file_encoding().
+    'error_handling' is passed on as the 'errors' argument of the
+    underlying open call.  When io.open() could be imported it is used
+    directly; otherwise codecs.open() is used and its stream is wrapped
+    in a NormalisedNewlineStream unless 'require_normalised_newlines'
+    is false.
+    """
+    if encoding is None:
+        encoding = detect_file_encoding(source_filename)
+    if io_open is None:
+        # codecs module doesn't have universal newline support
+        reader = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if not require_normalised_newlines:
+            return reader
+        return NormalisedNewlineStream(reader)
+    return io_open(source_filename, mode=mode,
+                   encoding=encoding, errors=error_handling)

 def str_to_number(value):
    # note: this expects a string as input that was accepted by the

--- a/tests/compile/msvc_strings.pyx
+++ b/tests/compile/msvc_strings.pyx