Merge circular imports stuff

d0f03098 · Robert Bradshaw · 6fd96df6 · a6fa9d1a · d0f03098 · d0f03098
Commit d0f03098 authored Apr 26, 2008 by Robert Bradshaw
54 changed files
--- a/Cython/Compiler/CmdLine.py
+++ b/Cython/Compiler/CmdLine.py
@@ -134,6 +134,9 @@ def parse_command_line(args):
            arg = pop_arg()
            if arg.endswith(".pyx"):
                sources.append(arg)
+            elif arg.endswith(".py"):
+                # maybe do some other stuff, but this should work for now
+                sources.append(arg)
            elif arg.endswith(".o"):
                options.objects.append(arg)
            else:

--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -2,9 +2,10 @@
 #   Pyrex - Code output module
 #

+import codecs
 import Naming
 import Options
-from Cython.Utils import open_new_file
+from Cython.Utils import open_new_file, open_source_file
 from PyrexTypes import py_object_type, typecast
 from TypeSlots import method_coexist

@@ -85,23 +86,24 @@ class CCodeWriter:
    def indent(self):
        self.f.write("  " * self.level)

+    def get_py_version_hex(self, pyversion):
+        # Format a version tuple as a hex literal, two hex digits per
+        # component; missing components are padded with zeros and anything
+        # past the fourth is dropped, e.g. (2, 3) -> "0x02030000".
+        return "0x%02X%02X%02X%02X" % (tuple(pyversion) + (0,0,0,0))[:4]
+
    def file_contents(self, file):
        try:
            return self.input_file_contents[file]
        except KeyError:
-            F = [line.replace('*/', '*[inserted by cython to avoid comment closer]/')
-                 for line in open(file).readlines()]
+            F = [line.encode('ASCII', 'replace').replace(
+                    '*/', '*[inserted by cython to avoid comment closer]/')
+                 for line in open_source_file(file)]
            self.input_file_contents[file] = F
            return F

-    def get_py_version_hex(self, pyversion):
-        return "0x%02X%02X%02X%02X" % (tuple(pyversion) + (0,0,0,0))[:4]
-
    def mark_pos(self, pos):
        if pos is None:
            return
-        file, line, col = pos
-        contents = self.file_contents(file)
+        filename, line, col = pos
+        contents = self.file_contents(filename)

        context = ''
        for i in range(max(0,line-3), min(line+2, len(contents))):
@@ -110,7 +112,7 @@ class CCodeWriter:
                s = s.rstrip() + '             # <<<<<<<<<<<<<< ' + '\n'
            context += " * " + s

-        marker = '"%s":%s\n%s' % (file, line, context)
+        marker = '"%s":%d\n%s' % (filename.encode('ASCII', 'replace'), line, context)
        if self.last_marker != marker:
            self.marker = marker


--- a/Cython/Compiler/ControlFlow.py
+++ b/Cython/Compiler/ControlFlow.py
-import bisect
+import bisect, sys

 # This module keeps track of arbitrary "states" at any point of the code. 
 # A state is considered known if every path to the given point agrees on
@@ -13,6 +13,8 @@ import bisect
 # redesigned. It doesn't take return, raise, continue, or break into 
 # account. 

+_END_POS = ((unichr(sys.maxunicode)*10),())
+
 class ControlFlow:

    def __init__(self, start_pos, incoming, parent):
@@ -22,7 +24,7 @@ class ControlFlow:
            parent = incoming.parent
        self.parent = parent
        self.tip = {}
-        self.end_pos = ((),)
+        self.end_pos = _END_POS
        
    def start_branch(self, pos):
        self.end_pos = pos
@@ -40,10 +42,10 @@ class ControlFlow:
        self.parent.end_pos = pos
        return LinearControlFlow(pos, self.parent)
        
-    def get_state(self, item, pos=((),())):
+    def get_state(self, item, pos=_END_POS):
        return self.get_pos_state(item, pos)[1]
        
-    def get_pos_state(self, item, pos=((),())):
+    def get_pos_state(self, item, pos=_END_POS):
        # do some caching
        if pos > self.end_pos:
            try:
@@ -61,13 +63,13 @@ class LinearControlFlow(ControlFlow):
        self.events = {}
            
    def set_state(self, pos, item, state):
-        if self.tip.has_key(item):
+        if item in self.tip:
            del self.tip[item]
        if pos < self.start_pos:
            if self.incoming is not None:
                self.incoming.set_state(pos, item, state)
        else:
-            if self.events.has_key(item):
+            if item in self.events:
                event_list = self.events[item]
            else:
                event_list = []
@@ -77,7 +79,7 @@ class LinearControlFlow(ControlFlow):
        
    def _get_pos_state(self, item, pos):
        if pos > self.start_pos:
-            if self.events.has_key(item):
+            if item in self.events:
                event_list = self.events[item]
                for event in event_list[::-1]:
                    if event[0] < pos:
@@ -116,7 +118,7 @@ class BranchingControlFlow(ControlFlow):
        
    def set_state(self, pos, item, state):
    
-        if self.tip.has_key(item):
+        if item in self.tip:
            del self.tip[item]
        
        if pos < self.start_pos:
@@ -157,5 +159,3 @@ class BranchingControlFlow(ControlFlow):
        if self.incoming is not limit and self.incoming is not None:
            s = "%s\n%s" % (self.incoming.to_string(indent, limit=limit), s)
        return s
-        
-    
\ No newline at end of file
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -18,6 +18,29 @@ from Cython.Debugging import print_call_chain
 from DebugFlags import debug_disposal_code, debug_temp_alloc, \
    debug_coercion

+class EncodedString(unicode):
+    # unicode string subclass to keep track of the original encoding.
+    # 'encoding' is None for unicode strings and the source encoding
+    # otherwise
+    encoding = None
+
+    def byteencode(self):
+        # Encode the string with the encoding attached to it; asserts
+        # that an encoding is set (i.e. this is not a unicode string).
+        assert self.encoding is not None
+        return self.encode(self.encoding)
+
+    def utf8encode(self):
+        # Encode the string as UTF-8; asserts that no encoding is
+        # attached (i.e. this is a unicode string).
+        assert self.encoding is None
+        return self.encode("UTF-8")
+
+    def is_unicode(self):
+        return self.encoding is None
+    # exposed as a read-only attribute: true when no encoding is attached
+    is_unicode = property(is_unicode)
+
+#    def __eq__(self, other):
+#        return unicode.__eq__(self, other) and \
+#            getattr(other, 'encoding', '') == self.encoding
+
+
 class ExprNode(Node):
    #  subexprs     [string]     Class var holding names of subexpr node attrs
    #  type         PyrexType    Type of the result
@@ -669,7 +692,7 @@ class IntNode(ConstNode):
            return str(self.value)

    def compile_time_value(self, denv):
-        return int(self.value)
+        return int(self.value, 0)


 class FloatNode(ConstNode):
@@ -678,6 +701,17 @@ class FloatNode(ConstNode):
    def compile_time_value(self, denv):
        return float(self.value)
    
+    def calculate_result_code(self):
+        # Return the C expression for this float constant.  The string
+        # forms 'nan', 'inf' and '-inf' are not emitted verbatim; they are
+        # rewritten in terms of the Py_HUGE_VAL macro (the 'nan' case
+        # presumably relies on infinity * 0 evaluating to NaN -- confirm
+        # for the target platforms).  Any other value is emitted as is.
+        strval = str(self.value)
+        if strval == 'nan':
+            return "(Py_HUGE_VAL * 0)"
+        elif strval == 'inf':
+            return "Py_HUGE_VAL"
+        elif strval == '-inf':
+            return "(-Py_HUGE_VAL)"
+        else:
+            return strval
+

 class StringNode(ConstNode):
    #  entry   Symtab.Entry
@@ -685,15 +719,16 @@ class StringNode(ConstNode):
    type = PyrexTypes.c_char_ptr_type

    def compile_time_value(self, denv):
-        return eval('"%s"' % self.value)
+        return self.value
    
    def analyse_types(self, env):
        self.entry = env.add_string_const(self.value)
    
    def coerce_to(self, dst_type, env):
        if dst_type.is_int:
-            if not self.type.is_pyobject and len(self.value) == 1:
-                return CharNode(self.pos, value=self.value)
+            if not self.type.is_pyobject and len(self.entry.init) == 1:
+                # we use the *encoded* value here
+                return CharNode(self.pos, value=self.entry.init)
            else:
                error(self.pos, "Only coerce single-character ascii strings can be used as ints.")
                return self
@@ -776,7 +811,7 @@ class NameNode(AtomicExprNode):
        try:
            return denv.lookup(self.name)
        except KeyError:
-            error(self.pos, "Compile-time name '%s' not defined", self.name)
+            error(self.pos, "Compile-time name '%s' not defined" % self.name)
    
    def coerce_to(self, dst_type, env):
        #  If coercing to a generic pyobject and this is a builtin
@@ -4067,9 +4102,8 @@ bad:

 cpp_exception_utility_code = [
 """
-static int __Pyx_CppExn2PyErr(); /*proto*/
-""","""
-void __Pyx_CppExn2PyErr() {
+#ifndef __Pyx_CppExn2PyErr
+static void __Pyx_CppExn2PyErr() {
  try {
    if (PyErr_Occurred())
      ; // let the latest Python exn pass through and ignore the current one
@@ -4086,6 +4120,7 @@ void __Pyx_CppExn2PyErr() {
    PyErr_SetString(PyExc_RuntimeError, "Unknown exception");
  }
 }
-"""]
+#endif
+""",""]

 #------------------------------------------------------------------------------------
--- a/Cython/Compiler/Main.py
+++ b/Cython/Compiler/Main.py
@@ -2,12 +2,11 @@
 #   Cython Top Level
 #

-import os, sys, re
+import os, sys, re, codecs
 if sys.version_info[:2] < (2, 2):
    print >>sys.stderr, "Sorry, Cython requires Python 2.2 or later"
    sys.exit(1)

-import os
 from time import time
 import Version
 from Scanning import PyrexScanner
@@ -140,8 +139,17 @@ class Context:

    def parse(self, source_filename, type_names, pxd, full_module_name):
        # Parse the given source file and return a parse tree.
-        f = open(source_filename, "rU")
-        s = PyrexScanner(f, source_filename, 
+        f = Utils.open_source_file(source_filename, "rU")
+
+        # The scanner is handed a unicode file name: byte-string names are
+        # decoded with the file system encoding, falling back to Python's
+        # default encoding when the former is not known.
+        if isinstance(source_filename, unicode):
+            name = source_filename
+        else:
+            filename_encoding = sys.getfilesystemencoding()
+            if filename_encoding is None:
+                # getdefaultencoding() lives in sys; the bare name would
+                # raise NameError on exactly this fallback path.
+                filename_encoding = sys.getdefaultencoding()
+            name = source_filename.decode(filename_encoding)
+
+        s = PyrexScanner(f, name, source_encoding = f.encoding,
                         type_names = type_names, context = self)
        try:
            tree = Parsing.p_module(s, pxd, full_module_name)

--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
@@ -793,13 +793,14 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
            "static void %s(PyObject *o) {"
                % scope.mangle_internal("tp_dealloc"))
        py_attrs = []
+        weakref_slot = scope.lookup_here("__weakref__")
        for entry in scope.var_entries:
-            if entry.type.is_pyobject and entry.name != "__weakref__":
+            if entry.type.is_pyobject and entry is not weakref_slot:
                py_attrs.append(entry)
-        if py_attrs or scope.lookup_here("__weakref__"):
+        if py_attrs or weakref_slot in scope.var_entries:
            self.generate_self_cast(scope, code)
        self.generate_usr_dealloc_call(scope, code)
-        if scope.lookup_here("__weakref__"):
+        if weakref_slot in scope.var_entries:
            code.putln("if (p->__weakref__) PyObject_ClearWeakRefs(o);")
        for entry in py_attrs:
            code.put_xdecref("p->%s" % entry.cname, entry.type)
@@ -1377,7 +1378,7 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
                        entry.pystring_cname,
                        entry.cname,
                        entry.cname,
-                        isinstance(entry.init, unicode)
+                        entry.type.is_unicode
                        ))
            code.putln(
                "{0, 0, 0, 0}")

--- a/Cython/Compiler/Nodes.py
+++ b/Cython/Compiler/Nodes.py
@@ -38,6 +38,30 @@ def relative_position(pos):
    """
    return (pos[0][absolute_path_length+1:], pos[1])

+def embed_position(pos, docstring):
+    # Prefix a docstring with a 'File: ... (starting at line ...)' line.
+    #
+    #  pos        position, passed on to relative_position()
+    #  docstring  EncodedString or None
+    #
+    # Returns 'docstring' unchanged when Options.embed_pos_in_docstring is
+    # not set, otherwise a new EncodedString.  The result keeps the
+    # encoding of the original docstring if the position line can be
+    # encoded with it, and becomes a unicode string (encoding None)
+    # otherwise.
+
+    # Imported at function level, as done elsewhere in this module.
+    import ExprNodes
+
+    if not Options.embed_pos_in_docstring:
+        return docstring
+    # This is a plain function, so the position comes from the 'pos'
+    # argument; there is no 'self' here.
+    pos_line = u'File: %s (starting at line %s)' % relative_position(pos)
+    if docstring is None:
+        # unicode string
+        return ExprNodes.EncodedString(pos_line)
+
+    # make sure we can encode the filename in the docstring encoding
+    # otherwise make the docstring a unicode string
+    encoding = docstring.encoding
+    if encoding is not None:
+        try:
+            # only checking encodability, the result is not needed
+            pos_line.encode(encoding)
+        except UnicodeEncodeError:
+            encoding = None
+
+    if not docstring:
+        # empty docstring: only the position line is emitted
+        doc = ExprNodes.EncodedString(pos_line)
+    else:
+        # u'\\n' is a literal backslash followed by 'n', not a newline
+        doc = ExprNodes.EncodedString(pos_line + u'\\n' + docstring)
+    doc.encoding = encoding
+    return doc

 class AttributeAccessor:
    """Used as the result of the Node.get_children_accessors() generator"""
@@ -1199,7 +1223,7 @@ class DefNode(FuncDefNode):
    # args          [CArgDeclNode]         formal arguments
    # star_arg      PyArgDeclNode or None  * argument
    # starstar_arg  PyArgDeclNode or None  ** argument
-    # doc           string or None
+    # doc           EncodedString or None
    # body          StatListNode
    #
    #  The following subnode is constructed internally
@@ -1357,17 +1381,12 @@ class DefNode(FuncDefNode):
            Naming.pyfunc_prefix + prefix + name
        entry.pymethdef_cname = \
            Naming.pymethdef_prefix + prefix + name
-        if not Options.docstrings:
-            self.entry.doc = None
-        else:
-            if Options.embed_pos_in_docstring:
-                entry.doc = 'File: %s (starting at line %s)'%relative_position(self.pos)
-                if not self.doc is None:
-                    entry.doc = entry.doc + '\\n' + self.doc
-            else:
-                entry.doc = self.doc
+        if Options.docstrings:
+            entry.doc = embed_position(self.pos, self.doc)
            entry.doc_cname = \
                Naming.funcdoc_prefix + prefix + name
+        else:
+            entry.doc = None

    def declare_arguments(self, env):
        for arg in self.args:
@@ -1897,7 +1916,7 @@ class OverrideCheckNode(StatNode):
 class PyClassDefNode(StatNode, BlockNode):
    #  A Python class definition.
    #
-    #  name     string          Name of the class
+    #  name     EncodedString   Name of the class
    #  doc      string or None
    #  body     StatNode        Attribute definition code
    #  entry    Symtab.Entry
@@ -1919,9 +1938,7 @@ class PyClassDefNode(StatNode, BlockNode):
        import ExprNodes
        self.dict = ExprNodes.DictNode(pos, key_value_pairs = [])
        if self.doc and Options.docstrings:
-            if Options.embed_pos_in_docstring:
-                doc = 'File: %s (starting at line %s)'%relative_position(self.pos)
-            doc = doc + '\\n' + self.doc
+            doc = embed_position(self.pos, self.doc)
            doc_node = ExprNodes.StringNode(pos, value = doc)
        else:
            doc_node = None
@@ -1961,7 +1978,7 @@ class PyClassDefNode(StatNode, BlockNode):
        self.dict.generate_disposal_code(code)


-class CClassDefNode(StatNode):
+class CClassDefNode(StatNode, BlockNode):
    #  An extension type definition.
    #
    #  visibility         'private' or 'public' or 'extern'
@@ -2034,11 +2051,7 @@ class CClassDefNode(StatNode):
        scope = self.entry.type.scope

        if self.doc and Options.docstrings:
-            if Options.embed_pos_in_docstring:
-                scope.doc = 'File: %s (starting at line %s)'%relative_position(self.pos)
-                scope.doc = scope.doc + '\\n' + self.doc
-            else:
-                scope.doc = self.doc
+            scope.doc = embed_position(self.pos, self.doc)

        if has_body:
            self.body.analyse_declarations(scope)
@@ -2054,6 +2067,7 @@ class CClassDefNode(StatNode):
            self.body.analyse_expressions(scope)
    
    def generate_function_definitions(self, env, code, transforms):
+        self.generate_py_string_decls(self.entry.type.scope, code)
        if self.body:
            self.body.generate_function_definitions(
                self.entry.type.scope, code, transforms)
@@ -2073,7 +2087,7 @@ class PropertyNode(StatNode):
    #  Definition of a property in an extension type.
    #
    #  name   string
-    #  doc    string or None    Doc string
+    #  doc    EncodedString or None    Doc string
    #  body   StatListNode
    
    child_attrs = ["body"]

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -2,7 +2,7 @@
 #   Pyrex Parser
 #

-import os, re
+import os, re, codecs
 from string import join, replace
 from types import ListType, TupleType
 from Scanning import PyrexScanner
@@ -10,6 +10,7 @@ import Nodes
 import ExprNodes
 from ModuleNode import ModuleNode
 from Errors import error, InternalError
+from Cython import Utils

 def p_ident(s, message = "Expected an identifier"):
    if s.sy == 'IDENT':
@@ -281,8 +282,10 @@ def p_call(s, function):
            if not arg.is_name:
                s.error("Expected an identifier before '='",
                    pos = arg.pos)
+            encoded_name = ExprNodes.EncodedString(arg.name)
+            encoded_name.encoding = s.source_encoding
            keyword = ExprNodes.StringNode(arg.pos, 
-                value = arg.name)
+                value = encoded_name)
            arg = p_simple_expr(s)
            keyword_args.append((keyword, arg))
        else:
@@ -459,7 +462,7 @@ def p_atom(s):
        value = s.systring[:-1]
        s.next()
        return ExprNodes.ImagNode(pos, value = value)
-    elif sy == 'STRING' or sy == 'BEGIN_STRING':
+    elif sy == 'BEGIN_STRING':
        kind, value = p_cat_string_literal(s)
        if kind == 'c':
            return ExprNodes.CharNode(pos, value = value)
@@ -500,7 +503,12 @@ def p_name(s, name):
            elif isinstance(value, float):
                return ExprNodes.FloatNode(pos, value = rep)
            elif isinstance(value, str):
-                return ExprNodes.StringNode(pos, value = rep[1:-1])
+                sval = ExprNodes.EncodedString(rep[1:-1])
+                sval.encoding = value.encoding
+                return ExprNodes.StringNode(pos, value = sval)
+            elif isinstance(value, unicode):
+                sval = ExprNodes.EncodedString(rep[2:-1])
+                return ExprNodes.StringNode(pos, value = sval)
            else:
                error(pos, "Invalid type for compile-time constant: %s"
                    % value.__class__.__name__)
@@ -508,21 +516,25 @@ def p_name(s, name):

 def p_cat_string_literal(s):
    # A sequence of one or more adjacent string literals.
-    # Returns (kind, value) where kind in ('', 'c', 'r')
+    # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
    kind, value = p_string_literal(s)
    if kind != 'c':
        strings = [value]
-        while s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+        while s.sy == 'BEGIN_STRING':
            next_kind, next_value = p_string_literal(s)
            if next_kind == 'c':
                self.error(
                    "Cannot concatenate char literal with another string or char literal")
+            elif next_kind == 'u':
+                kind = 'u'
            strings.append(next_value)
-        value = ''.join(strings)
+        value = ExprNodes.EncodedString( u''.join(strings) )
+        if kind != 'u':
+            value.encoding = s.source_encoding
    return kind, value

 def p_opt_string_literal(s):
-    if s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+    if s.sy == 'BEGIN_STRING':
        return p_string_literal(s)
    else:
        return None
@@ -530,10 +542,6 @@ def p_opt_string_literal(s):
 def p_string_literal(s):
    # A single string or char literal.
    # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
-    if s.sy == 'STRING':
-        value = unquote(s.systring)
-        s.next()
-        return value
    # s.sy == 'BEGIN_STRING'
    pos = s.position()
    #is_raw = s.systring[:1].lower() == "r"
@@ -549,8 +557,6 @@ def p_string_literal(s):
            systr = s.systring
            if len(systr) == 1 and systr in "'\"\n":
                chars.append('\\')
-            if kind == 'u' and not isinstance(systr, unicode):
-                systr = systr.decode("UTF-8")
            chars.append(systr)
        elif sy == 'ESCAPE':
            systr = s.systring
@@ -572,7 +578,8 @@ def p_string_literal(s):
                elif c in 'ux':
                    if kind == 'u':
                        try:
-                            chars.append(systr.decode('unicode_escape'))
+                            chars.append(
+                                systr.encode("ASCII").decode('unicode_escape'))
                        except UnicodeDecodeError:
                            s.error("Invalid unicode escape '%s'" % systr,
                                    pos = pos)
@@ -593,50 +600,12 @@ def p_string_literal(s):
                "Unexpected token %r:%r in string literal" %
                    (sy, s.systring))
    s.next()
-    value = ''.join(chars)
+    value = ExprNodes.EncodedString( u''.join(chars) )
+    if kind != 'u':
+        value.encoding = s.source_encoding
    #print "p_string_literal: value =", repr(value) ###
    return kind, value

-def unquote(s):
-    is_raw = 0
-    if s[:1].lower() == "r":
-        is_raw = 1
-        s = s[1:]
-    q = s[:3]
-    if q == '"""' or q == "'''":
-        s = s[3:-3]
-    else:
-        s = s[1:-1]
-    if is_raw:
-        s = s.replace('\\', '\\\\')
-        s = s.replace('\n', '\\\n')
-    else:
-        # Split into double quotes, newlines, escape sequences 
-        # and spans of regular chars
-        l1 = re.split(r'((?:\\[0-7]{1,3})|(?:\\x[0-9A-Fa-f]{2})|(?:\\.)|(?:\\\n)|(?:\n)|")', s)
-        #print "unquote: l1 =", l1 ###
-        l2 = []
-        for item in l1:
-            if item == '"' or item == '\n':
-                l2.append('\\' + item)
-            elif item == '\\\n':
-                pass
-            elif item[:1] == '\\':
-                if len(item) == 2:
-                    if item[1] in '"\\abfnrtv':
-                        l2.append(item)
-                    else:
-                        l2.append(item[1])
-                elif item[1:2] == 'x':
-                    l2.append('\\x0' + item[2:])
-                else:
-                    # octal escape
-                    l2.append(item)
-            else:
-                l2.append(item)
-        s = "".join(l2)
-    return s
-        
 # list_display  	::=  	"[" [listmaker] "]"
 # listmaker 	::= 	expression ( list_for | ( "," expression )* [","] )
 # list_iter 	::= 	list_for | list_if
@@ -946,6 +915,8 @@ def p_import_statement(s):
                    ExprNodes.StringNode(pos, value = "*")])
            else:
                name_list = None
+            dotted_name = ExprNodes.EncodedString(dotted_name)
+            dotted_name.encoding = s.source_encoding
            stat = Nodes.SingleAssignmentNode(pos,
                lhs = ExprNodes.NameNode(pos, 
                    name = as_name or target_name),
@@ -984,14 +955,18 @@ def p_from_import_statement(s):
        imported_name_strings = []
        items = []
        for (name_pos, name, as_name) in imported_names:
+            encoded_name = ExprNodes.EncodedString(name)
+            encoded_name.encoding = s.source_encoding
            imported_name_strings.append(
-                ExprNodes.StringNode(name_pos, value = name))
+                ExprNodes.StringNode(name_pos, value = encoded_name))
            items.append(
                (name,
                 ExprNodes.NameNode(name_pos, 
                 	name = as_name or name)))
        import_list = ExprNodes.ListNode(
            imported_names[0][0], args = imported_name_strings)
+        dotted_name = ExprNodes.EncodedString(dotted_name)
+        dotted_name.encoding = s.source_encoding
        return Nodes.FromImportStatNode(pos,
            module = ExprNodes.ImportNode(dotted_name_pos,
                module_name = ExprNodes.StringNode(dotted_name_pos,
@@ -1204,8 +1179,8 @@ def p_include_statement(s, level):
    if s.compile_time_eval:
        include_file_path = s.context.find_include_file(include_file_name, pos)
        if include_file_path:
-            f = open(include_file_path, "rU")
-            s2 = PyrexScanner(f, include_file_path, s)
+            f = Utils.open_source_file(include_file_path, mode="rU")
+            s2 = PyrexScanner(f, include_file_path, s, source_encoding=f.encoding)
            try:
                tree = p_statement_list(s2, level)
            finally:
@@ -1996,7 +1971,8 @@ def p_class_statement(s):
    # s.sy == 'class'
    pos = s.position()
    s.next()
-    class_name = p_ident(s)
+    class_name = ExprNodes.EncodedString( p_ident(s) )
+    class_name.encoding = s.source_encoding
    if s.sy == '(':
        s.next()
        base_list = p_simple_expr_list(s)
@@ -2113,7 +2089,7 @@ def p_property_decl(s):
    return Nodes.PropertyNode(pos, name = name, doc = doc, body = body)

 def p_doc_string(s):
-    if s.sy == 'STRING' or s.sy == 'BEGIN_STRING':
+    if s.sy == 'BEGIN_STRING':
        _, result = p_cat_string_literal(s)
        if s.sy != 'EOF':
            s.expect_newline("Syntax error in doc string")

--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -37,6 +37,7 @@ class PyrexType(BaseType):
    #  is_enum               boolean     Is a C enum type
    #  is_typedef            boolean     Is a typedef type
    #  is_string             boolean     Is a C char * type
+    #  is_unicode            boolean     Is a UTF-8 encoded C char * type
    #  is_returncode         boolean     Is used only to signal exceptions
    #  is_error              boolean     Is the dummy error type
    #  has_attributes        boolean     Has C dot-selectable attributes
@@ -83,6 +84,7 @@ class PyrexType(BaseType):
    is_enum = 0
    is_typedef = 0
    is_string = 0
+    is_unicode = 0
    is_returncode = 0
    is_error = 0
    has_attributes = 0
@@ -875,19 +877,49 @@ class CEnumType(CType):
            return self.base_declaration_code(public_decl(base, dll_linkage), entity_code)


+def _escape_byte_string(s):
+    # Return the byte string 's' with every byte >= 128 replaced by a C
+    # escape sequence, so that the result is pure ASCII.  Strings that
+    # are already ASCII are returned unchanged.
+    try:
+        s.decode("ASCII")
+        return s
+    except UnicodeDecodeError:
+        pass
+    l = []
+    append = l.append
+    for c in s:
+        o = ord(c)
+        if o >= 128:
+            # Use a fixed-width octal escape: a C hex escape ('\xNN') has
+            # no length limit and would swallow a following character
+            # that happens to be a hex digit, whereas an octal escape
+            # ends after at most three digits.
+            append('\\%03o' % o)
+        else:
+            append(c)
+    return ''.join(l)
+
 class CStringType:
    #  Mixin class for C string types.

    is_string = 1
+    is_unicode = 0
    
    to_py_function = "PyString_FromString"
    from_py_function = "PyString_AsString"
    exception_value = "NULL"

    def literal_code(self, value):
-        if isinstance(value, unicode):
-            value = value.encode("UTF-8")
-        return '"%s"' % value
+        assert isinstance(value, str)
+        return '"%s"' % _escape_byte_string(value)
+
+
+class CUTF8StringType:
+    #  Mixin class for C unicode types.
+
+    is_string = 1
+    is_unicode = 1
+    
+    to_py_function = "PyUnicode_DecodeUTF8"
+    exception_value = "NULL"
+
+    def literal_code(self, value):
+        assert isinstance(value, str)
+        return '"%s"' % _escape_byte_string(value)


 class CCharArrayType(CStringType, CArrayType):
@@ -900,6 +932,16 @@ class CCharArrayType(CStringType, CArrayType):
        CArrayType.__init__(self, c_char_type, size)


+class CUTF8CharArrayType(CUTF8StringType, CArrayType):
+    #  C 'char []' type, encoded in UTF-8.
+    
+    parsetuple_format = "s"
+    pymemberdef_typecode = "T_STRING_INPLACE"
+    
+    def __init__(self, size):
+        CArrayType.__init__(self, c_char_type, size)
+        CArrayType.__init__(self, c_char_type, size)
+    
+
 class CCharPtrType(CStringType, CPtrType):
    # C 'char *' type.
    
@@ -910,6 +952,16 @@ class CCharPtrType(CStringType, CPtrType):
        CPtrType.__init__(self, c_char_type)


+class CUTF8CharPtrType(CUTF8StringType, CPtrType):
+    # C 'char *' type, encoded in UTF-8.
+    
+    parsetuple_format = "s"
+    pymemberdef_typecode = "T_STRING"
+    
+    def __init__(self):
+        CPtrType.__init__(self, c_char_type)
+
+
 class ErrorType(PyrexType):
    # Used to prevent propagation of error messages.
    
@@ -974,7 +1026,9 @@ c_longdouble_type =  CFloatType(8)

 c_null_ptr_type =     CNullPtrType(c_void_type)
 c_char_array_type =   CCharArrayType(None)
+c_utf8_char_array_type =   CUTF8CharArrayType(None)
 c_char_ptr_type =     CCharPtrType()
+c_utf8_char_ptr_type =     CUTF8CharPtrType()
 c_char_ptr_ptr_type = CPtrType(c_char_ptr_type)
 c_int_ptr_type =      CPtrType(c_int_type)


--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -212,7 +212,7 @@ class PyrexScanner(Scanner):
    resword_dict = build_resword_dict()

    def __init__(self, file, filename, parent_scanner = None, 
-            type_names = None, context = None):
+            type_names = None, context = None, source_encoding=None):
        Scanner.__init__(self, get_lexicon(), file, filename)
        if parent_scanner:
            self.context = parent_scanner.context
@@ -226,6 +226,7 @@ class PyrexScanner(Scanner):
            self.compile_time_env = initial_compile_time_env()
            self.compile_time_eval = 1
            self.compile_time_expr = 0
+        self.source_encoding = source_encoding
        self.trace = trace_scanner
        self.indentation_stack = [0]
        self.indentation_char = None

--- a/Cython/Compiler/Symtab.py
+++ b/Cython/Compiler/Symtab.py
@@ -3,8 +3,6 @@
 #

 import re
-import bisect
-
 from Errors import warning, error, InternalError
 import Options
 import Naming
@@ -438,7 +436,13 @@ class Scope:
    def add_string_const(self, value):
        # Add an entry for a string constant.
        cname = self.new_const_cname()
-        entry = Entry("", cname, c_char_array_type, init = value)
+        if value.is_unicode:
+            c_type = c_utf8_char_array_type
+            value = value.utf8encode()
+        else:
+            c_type = c_char_array_type
+            value = value.byteencode()
+        entry = Entry("", cname, c_type, init = value)
        entry.used = 1
        self.const_entries.append(entry)
        return entry
@@ -460,7 +464,7 @@ class Scope:
        # Python identifier, it will be interned.
        if not entry.pystring_cname:
            value = entry.init
-            if identifier_pattern.match(value) and isinstance(value, str):
+            if not entry.type.is_unicode and identifier_pattern.match(value):
                entry.pystring_cname = self.intern(value)
                entry.is_interned = 1
            else:

--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -3,7 +3,7 @@
 #            anywhere else in particular
 #

-import os, sys
+import os, sys, re, codecs

 def replace_suffix(path, newsuf):
    base, _ = os.path.splitext(path)
@@ -32,3 +32,40 @@ def castrate_file(path, st):
        f.close()
        if st:
            os.utime(path, (st.st_atime, st.st_mtime))
+
+# support for source file encoding detection and unicode decoding
+
+_match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
+
+def detect_file_encoding(source_filename):
+    """Return the encoding declared in the first two lines of a source file.
+
+    Looks for a PEP 263 style "coding:" / "coding=" declaration and
+    returns the declared name; falls back to "UTF-8" (PEP 3120) when
+    neither of the first two lines carries one.
+    """
+    # The real encoding is not known yet, so the header is read as UTF-8
+    # with undecodable bytes dropped: a strict decoder would raise
+    # UnicodeDecodeError on e.g. a Latin-1 file before its declaration
+    # could be seen.  The declaration itself is plain ASCII and survives.
+    f = codecs.open(source_filename, "rU", encoding="UTF-8", errors="ignore")
+    try:
+        for line_no, line in enumerate(f):
+            encoding = _match_file_encoding(line)
+            if encoding:
+                return encoding.group(1)
+            if line_no == 1:
+                # only the first two lines may hold the declaration
+                break
+    finally:
+        f.close()
+    return "UTF-8"
+
+def open_source_file(source_filename, mode="rU"):
+    """Open a source file for decoded reading, using its detected encoding.
+
+    The encoding comes from detect_file_encoding(); the result is the
+    file object returned by codecs.open().
+    """
+    encoding = detect_file_encoding(source_filename)
+    return codecs.open(source_filename, mode=mode, encoding=encoding)
--- a/Demos/Setup.py
+++ b/Demos/Setup.py
+import glob
+
 from distutils.core import setup
 from distutils.extension import Extension
 from Cython.Distutils import build_ext

-setup(
-  name = 'Demos',
-  ext_modules=[ 
+ext_modules=[ 
    Extension("primes",       ["primes.pyx"]),
    Extension("spam",         ["spam.pyx"]),
-#    Extension("numeric_demo", ["numeric_demo.pyx"]),
-    Extension("test", ["test.pyx"]),
-    Extension("func_pointers", ["func_pointers.pyx"]),
-#    Extension("inplace", ["inplace.pyx"]),
-#    Extension("withGIL", ["withGIL.pyx"]),
-    Extension("class_members", ["class_members.pyx"]),
-#    Extension("inherit_bug", ["inherit_bug.pyx"]),
-    Extension("override", ["override.pyx"]),
-    Extension("cond", ["cond.pyx"]),
-#    Extension("submodule.test",       ["submodule/test.pyx"]),
-    Extension("errors",       ["errors.pyx"]),
-    Extension("cpdef",       ["cpdef.pyx"]),
-    Extension("range",       ["range.pyx"]),
-    Extension("early_temps",       ["early_temps.pyx"]),
-    Extension("ints",       ["ints.pyx"]),
-    Extension("clear",       ["clear.pyx"]),
-    Extension("detect_override",       ["detect_override.pyx"]),
-    Extension("fixes",       ["fixes.pyx"]),
-    ],
+]
+
+# primes and spam are already listed explicitly in ext_modules, and the
+# glob below matches their .pyx files too; skip any name that is already
+# registered so that no module is added (and built) twice.
+seen = set([ext.name for ext in ext_modules])
+for pyx_file in glob.glob("*.pyx"):
+    name = pyx_file[:-4]  # strip the ".pyx" suffix
+    if pyx_file != "numeric_demo.pyx" and name not in seen:
+        seen.add(name)
+        ext_modules.append(Extension(name, [pyx_file]))
+
+setup(
+  name = 'Demos',
  cmdclass = {'build_ext': build_ext},
-#  include_dirs = "/System/Library/Frameworks/Python.framework/Versions/2.3/include/python2.3/"
+  ext_modules = ext_modules,
 )
--- a/Includes/python.pxi
+++ b/Includes/python.pxi
--- a/Includes/python2.5.pxi
+++ b/Includes/python2.5.pxi
--- a/Includes/python_bool.pxi
+++ b/Includes/python_bool.pxi
--- a/Includes/python_complex.pxi
+++ b/Includes/python_complex.pxi
--- a/Includes/python_dict.pxi
+++ b/Includes/python_dict.pxi
--- a/Includes/python_exc.pxi
+++ b/Includes/python_exc.pxi
--- a/Includes/python_float.pxi
+++ b/Includes/python_float.pxi
--- a/Includes/python_function.pxi
+++ b/Includes/python_function.pxi
--- a/Includes/python_instance.pxi
+++ b/Includes/python_instance.pxi
--- a/Includes/python_int.pxi
+++ b/Includes/python_int.pxi
--- a/Includes/python_iterator.pxi
+++ b/Includes/python_iterator.pxi
--- a/Includes/python_list.pxi
+++ b/Includes/python_list.pxi
--- a/Includes/python_long.pxi
+++ b/Includes/python_long.pxi
--- a/Includes/python_mapping.pxi
+++ b/Includes/python_mapping.pxi
--- a/Includes/python_mem.pxi
+++ b/Includes/python_mem.pxi
--- a/Includes/python_method.pxi
+++ b/Includes/python_method.pxi
--- a/Includes/python_module.pxi
+++ b/Includes/python_module.pxi
--- a/Includes/python_number.pxi
+++ b/Includes/python_number.pxi
--- a/Includes/python_object.pxi
+++ b/Includes/python_object.pxi
--- a/Includes/python_parse.pxi
+++ b/Includes/python_parse.pxi
--- a/Includes/python_ref.pxi
+++ b/Includes/python_ref.pxi
--- a/Includes/python_sequence.pxi
+++ b/Includes/python_sequence.pxi
--- a/Includes/python_set.pxi
+++ b/Includes/python_set.pxi
--- a/Includes/python_string.pxi
+++ b/Includes/python_string.pxi
--- a/Includes/python_tuple.pxi
+++ b/Includes/python_tuple.pxi
--- a/Includes/python_type.pxi
+++ b/Includes/python_type.pxi
--- a/Includes/stdio.pxi
+++ b/Includes/stdio.pxi
--- a/Includes/stdlib.pxi
+++ b/Includes/stdlib.pxi
--- a/runtests.py
+++ b/runtests.py
@@ -2,20 +2,16 @@

 import os, sys, unittest, doctest

-#from Cython.Distutils.build_ext import build_ext
-#from Cython.Distutils.extension import Extension
-
-from distutils.extension import Extension
+from Cython.Distutils.extension import Extension
 from Cython.Distutils import build_ext

-
 from distutils.dist import Distribution
 distutils_distro = Distribution()

 TEST_DIRS = ['compile', 'run']
 TEST_RUN_DIRS = ['run']

-INCLUDE_DIRS = os.getenv('INCLUDE', '').split(os.pathsep)
+INCLUDE_DIRS = [ d for d in os.getenv('INCLUDE', '').split(os.pathsep) if d ]
 CFLAGS = os.getenv('CFLAGS', '').split()

 class TestBuilder(object):

--- a/tests/compile/crunchytype.h
+++ b/tests/compile/crunchytype.h
+
+struct CrunchyType {
+  int number;
+  PyObject* string;
+};
--- a/tests/compile/crunchytype.pxd
+++ b/tests/compile/crunchytype.pxd
+cdef extern from "crunchytype.h":
+    cdef class crunchytype.Crunchy [ object CrunchyType ]:
+        cdef int number
+        cdef object string
--- a/tests/compile/extimportedsubtype.pyx
+++ b/tests/compile/extimportedsubtype.pyx
+from crunchytype cimport Crunchy
+
+cdef class Sub2(Crunchy):
+    cdef char character
+
+cdef class Sub1(Sub2):
+    cdef char character
--- a/tests/run/addop.pyx
+++ b/tests/run/addop.pyx
 __doc__ = """
-    >>> 
+    >>> f()
+    (30, 22)
 """

 def f():
    cdef int int1, int2, int3
-    cdef char *ptr1, *ptr2, *ptr3
+    cdef char *ptr1, *ptr2 = "test", *ptr3 = "toast"
    int2 = 10
    int3 = 20
    obj1 = 1

--- a/tests/run/cintop.pyx
+++ b/tests/run/cintop.pyx
 __doc__ = """
+    >>> int2 = 42
+    >>> int3 = 7
+    >>> char1 = ord('C')
+
+    >>> int1 = int2 | int3
+    >>> int1 |= int2 ^ int3
+    >>> int1 ^= int2 & int3
+    >>> int1 ^= int2 << int3
+    >>> int1 ^= int2 >> int3
+    >>> int1 ^= int2 << int3 | int2 >> int3
+    >>> long1 = char1 | int1
+    >>> print (int1, long1) == f()
+    True
+
    >>> f()
-    (5376, 67)
+    (45, 111)
 """

 def f():
@@ -12,10 +26,10 @@ def f():
    char1 = c'C'

    int1 = int2 | int3
-    int1 = int2 ^ int3
-    int1 = int2 & int3
-    int1 = int2 << int3
-    int1 = int2 >> int3
-    int1 = int2 << int3 | int2 >> int3
-    long1 = char1 | long2
+    int1 |= int2 ^ int3
+    int1 ^= int2 & int3
+    int1 ^= int2 << int3
+    int1 ^= int2 >> int3
+    int1 ^= int2 << int3 | int2 >> int3
+    long1 = char1 | int1
    return int1, long1
--- a/tests/run/ct_DEF.pyx
+++ b/tests/run/ct_DEF.pyx
 __doc__ = """
    >>> c()
    120
-    >>> i()
-    42
+    >>> i1() == 42
+    True
+    >>> i2() == 0x42
+    True
+    >>> i3() == 042
+    True
    >>> l()
    666
    >>> f()
@@ -23,7 +27,9 @@ DEF TUPLE = (1, 2, "buckle my shoe")
 DEF TRUE_FALSE = (True, False)

 DEF CHAR = c'x'
-DEF INT = 42
+DEF INT1 = 42
+DEF INT2 = 0x42
+DEF INT3 = 042
 DEF LONG = 666L
 DEF FLOAT = 12.5
 DEF STR = "spam"
@@ -37,9 +43,19 @@ def c():
    c = CHAR
    return c

-def i():
+def i1():
+    cdef int i
+    i = INT1
+    return i
+
+def i2():
+    cdef int i
+    i = INT2
+    return i
+
+def i3():
    cdef int i
-    i = INT
+    i = INT3
    return i

 def l():

--- a/tests/run/include.pyx
+++ b/tests/run/include.pyx
+__doc__ = """
+    >>> D
+    2
+"""
+
+D = 1
+
+include "testinclude.pxi"
--- a/tests/run/pyextattrref.pyx
+++ b/tests/run/pyextattrref.pyx
 __doc__ = """
-    >>> 
+    >>> s = Spam(Eggs("ham"))
+    >>> test(s)
+    'ham'
 """

 cdef class Eggs:
    cdef object ham
+    def __init__(self, ham):
+        self.ham = ham

 cdef class Spam:
    cdef Eggs eggs
+    def __init__(self, eggs):
+        self.eggs = eggs

-cdef void tomato(Spam s):
+cdef object tomato(Spam s):
    food = s.eggs.ham
+    return food
+
+def test(Spam s):
+    return tomato(s)
--- a/tests/run/r_docstrings.pyx
+++ b/tests/run/r_docstrings.pyx
 __doc__ = """
-    >>> print f.__doc__
-    This is a function docstring.
-    >>> print C.__doc__
-    This is a class docstring.
-    >>> print T.__doc__
-    This is an extension type docstring.
+    >>> f.__doc__
+    'This is a function docstring.'
+
+    >>> C.__doc__
+    'This is a class docstring.'
+    >>> CS.__doc__
+    'This is a subclass docstring.'
+    >>> print CSS.__doc__
+    None
+
+    >>> T.__doc__
+    'This is an extension type docstring.'
+    >>> TS.__doc__
+    'This is an extension subtype docstring.'
+    >>> TSS.__doc__
+
+Compare with standard Python:
+
+    >>> def f():
+    ...     'This is a function docstring.'
+    >>> f.__doc__
+    'This is a function docstring.'
+
+    >>> class C:
+    ...     'This is a class docstring.'
+    >>> class CS(C):
+    ...     'This is a subclass docstring.'
+    >>> class CSS(CS):
+    ...     pass
+
+    >>> C.__doc__
+    'This is a class docstring.'
+    >>> CS.__doc__
+    'This is a subclass docstring.'
+    >>> CSS.__doc__
 """

 def f():
@@ -13,6 +42,17 @@ def f():
 class C:
    "This is a class docstring."

+class CS(C):
+    "This is a subclass docstring."
+
+class CSS(CS):
+    pass
+
 cdef class T:
    "This is an extension type docstring."

+cdef class TS(T):
+    "This is an extension subtype docstring."
+
+cdef class TSS(TS):
+    pass
--- a/tests/run/specialfloat.pyx
+++ b/tests/run/specialfloat.pyx
+__doc__ = """
+    >>> f()
+    12.5
+
+    >>> nan1()
+    nan
+    >>> nan2()
+    nan
+    >>> nan3()
+    nan
+    >>> float_nan
+    nan
+
+    >>> infp1()
+    inf
+    >>> infp1() == float('inf')
+    True
+    >>> infp2()
+    inf
+    >>> infp2() == float('inf')
+    True
+    >>> infp3()
+    inf
+    >>> infp3() == float('inf')
+    True
+    >>> float_infp
+    inf
+    >>> float_infp == float('inf')
+    True
+
+    >>> infn1()
+    -inf
+    >>> infn1() == float('-inf')
+    True
+    >>> infn2()
+    -inf
+    >>> infn2() == float('-inf')
+    True
+    >>> infn3()
+    -inf
+    >>> infn3() == float('-inf')
+    True
+    >>> float_infn
+    -inf
+    >>> float_infn == float('-inf')
+    True
+"""
+
+DEF FLOAT = 12.5
+DEF FLOAT_NAN = float('nan')
+DEF FLOAT_INFP = float('+inf')
+DEF FLOAT_INFN = float('-inf')
+
+float_nan = FLOAT_NAN
+float_infp = FLOAT_INFP
+float_infn = FLOAT_INFN
+
+def f():
+    cdef float f
+    f = FLOAT
+    return f
+
+def nan1():
+    cdef double f
+    f = FLOAT_NAN
+    return f
+
+def nan2():
+    cdef double f
+    f = float('nan')
+    return f
+
+def nan3():
+    cdef float f
+    f = FLOAT_NAN
+    return f
+
+def infp1():
+    cdef double f
+    f = FLOAT_INFP
+    return f
+
+def infp2():
+    cdef double f
+    f = float('+inf')
+    return f
+
+def infp3():
+    cdef float f
+    f = FLOAT_INFP
+    return f
+
+def infn1():
+    cdef double f
+    f = FLOAT_INFN
+    return f
+
+def infn2():
+    cdef double f
+    f = float('-inf')
+    return f
+
+def infn3():
+    cdef float f
+    f = FLOAT_INFN
+    return f
+
--- a/tests/run/testinclude.pxi
+++ b/tests/run/testinclude.pxi
+D = 2
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -49,13 +49,17 @@ __doc__ = r"""
    True
    >>> d == u'üÖä'
    True
-    >>> e == u'\x03\x67\xf8\uf8d2Søk ik'
+    >>> e == u'\x03\x67\xf8\uf8d2Søk ik'     # unescaped by Cython
    True
-    >>> f == u'\xf8'
+    >>> e == u'\\x03\\x67\\xf8\\uf8d2Søk ik' # unescaped by Python
+    True
+    >>> f == u'\xf8'  # unescaped by Cython
+    True
+    >>> f == u'\\xf8' # unescaped by Python
    True
    >>> add == u'Søk ik' + u'üÖä' + 'abc'
    True
-    >>> null == u'\\x00' # doctest needs a double slash here
+    >>> null == u'\\x00' # unescaped by Python (required by doctest)
    True
 """