Commit 03199009 authored by Robert Bradshaw's avatar Robert Bradshaw

Faster dependency parsing.

Now based on regex, as actually parsing the file was way too slow.
parent 57cdf36e
import functools
import os
import re

from Cython import Utils
def cached_method(f):
    """
    Decorator memoizing a method's result per instance, keyed by its
    positional arguments (which must therefore be hashable).

    The cache dict is stored on the instance itself under a name derived
    from the wrapped function, so separate instances never share results.
    """
    cache_name = '__%s_cache' % f.__name__
    @functools.wraps(f)  # preserve the wrapped method's name/docstring
    def wrapper(self, *args):
        cache = getattr(self, cache_name, None)
        if cache is None:
            # Lazily create the per-instance cache on first call.
            cache = {}
            setattr(self, cache_name, cache)
        if args in cache:
            return cache[args]
        res = cache[args] = f(self, *args)
        return res
    return wrapper
def strip_string_literals(code, prefix='__Pyx_L'):
    """
    Normalizes every string literal to be of the form '__Pyx_Lxxx',
    returning the normalized code and a mapping of labels to
    string literals.
    """
    new_code = []     # chunks of rewritten code, joined at the end
    literals = {}     # label -> original literal body
    counter = 0       # bumped per literal to make labels unique
    start = q = 0     # start of current unprocessed span / scan position
    in_quote = False  # False outside a literal, else its delimiter ("'", '"', "'''", '"""')
    # NOTE(review): `raw` is set below but never consulted or returned in
    # this function — looks like unfinished raw-string escape handling;
    # confirm against the rest of the module.
    raw = False
    while True:
        # Locate the next quote character of either kind.
        single_q = code.find("'", q)
        double_q = code.find('"', q)
        q = min(single_q, double_q)
        # min() yields -1 when one kind is exhausted; fall back to the other.
        if q == -1: q = max(single_q, double_q)
        if q == -1:
            # No quotes left in the source.
            if in_quote:
                # Unterminated literal at EOF: label the trailing text.
                # The label keeps the opening quote so the rewritten code
                # still begins a string at this point.
                counter += 1
                label = "'%s%s" % (prefix, counter)
                literals[label] = code[start:]
                new_code.append(label)
            else:
                new_code.append(code[start:])
            break
        elif in_quote:
            # Inside a literal: is this quote escaped by backslashes?
            if code[q-1] == '\\':
                # k ends up as (number of preceding backslashes) + 1, so an
                # even k means an odd backslash run, i.e. an escaped quote.
                k = 2
                while q >= k and code[q-k] == '\\':
                    k += 1
                if k % 2 == 0:
                    q += 1
                    continue
            if code[q:q+len(in_quote)] == in_quote:
                # Closing delimiter: record the body, emit a quoted label.
                counter += 1
                label = "%s%s" % (prefix, counter)
                literals[label] = code[start+len(in_quote):q]
                new_code.append("'%s'" % label)
                q += len(in_quote)
                start = q
                in_quote = False
            else:
                # Lone quote inside a triple-quoted string; keep scanning.
                q += 1
        else:
            # Outside any literal: this quote opens one.
            raw = False
            if len(code) >= q+3 and (code[q+1] == code[q] == code[q+2]):
                in_quote = code[q]*3  # triple-quoted delimiter
            else:
                in_quote = code[q]
            # Back over prefix letters (r/R/b/B/u/U); the prefix chars are
            # dropped from the rewritten code (start jumps to the quote).
            end = q
            while end>0 and code[end-1] in 'rRbBuU':
                if code[end-1] in 'rR':
                    raw = True
                end -= 1
            new_code.append(code[start:end])
            start = q
            q += len(in_quote)
    return "".join(new_code), literals
def parse_dependencies(source_filename):
    """
    Extract (cimports, includes, externs) from a Cython source file.

    Genuinely parsing the file is far too slow, so this relies on regular
    expressions; the only catch is that comments and string literals must
    be removed/normalized ahead of time so they cannot cause false matches.
    """
    text = Utils.open_source_file(source_filename, "rU").read()
    text = re.sub('#.*', '', text)                  # strip comments
    text, label_map = strip_string_literals(text)   # normalize strings to labels
    text = text.replace('\\\n', ' ')                # splice continuation lines
    if '\t' in text:
        text = text.replace('\t', ' ')
    # TODO: pure mode
    pattern = re.compile(r"(cimport +([0-9a-zA-Z_.]+)\b)|(from +([0-9a-zA-Z_.]+) +cimport)|(include +'([^']+)')|(cdef +extern +from +'([^']+)')")
    cimports = []
    includes = []
    externs = []
    for match in pattern.finditer(text):
        (plain_stmt, plain_mod,
         from_stmt, from_mod,
         inc_stmt, inc_label,
         ext_stmt, ext_label) = match.groups()
        if plain_stmt:
            cimports.append(plain_mod)
        elif from_stmt:
            cimports.append(from_mod)
        elif inc_stmt:
            # include targets were string literals; map label back to text.
            includes.append(label_map[inc_label])
        else:
            externs.append(label_map[ext_label])
    return cimports, includes, externs
class DependencyTree(object):
    """
    Resolves and caches the dependency graph (cimports, includes and
    'cdef extern from' headers) of Cython source files.

    Per-file results are memoized by the cached_method decorator; the
    transitive-closure results are additionally cached per
    (extract, merge) pair in self._transitive_cache.
    """

    def __init__(self, context):
        self.context = context        # compiler context used to locate .pxd files
        self._transitive_cache = {}   # (extract, merge) -> {node: merged result}

    @cached_method
    def parse_dependencies(self, source_filename):
        # Cached wrapper around the module-level regex-based parser.
        return parse_dependencies(source_filename)

    @cached_method
    def immediate_dependencies(self, filename):
        """Return (cimports, externs) of filename, folding in its includes."""
        cimports, includes, externs = self.parse_dependencies(filename)
        cimports = set(cimports)
        externs = set(externs)
        # Included files are textually part of this one, so their
        # dependencies count as ours; include paths are relative to us.
        for include in includes:
            a, b = self.immediate_dependencies(
                os.path.join(os.path.dirname(filename), include))
            cimports.update(a)
            externs.update(b)
        return cimports, externs

    @cached_method
    def package(self, filename):
        """Return the package of filename as a list of directory names."""
        dir_name = os.path.dirname(filename)
        if os.path.exists(os.path.join(dir_name, '__init__.py')):
            return self.package(dir_name) + [os.path.basename(dir_name)]
        else:
            return []

    def find_pxd(self, module, filename=None):
        """Locate the .pxd file for module, or None if not found."""
        if module[0] == '.':
            # Portable call form (the original used the Py2-only
            # 'raise E, msg' statement syntax).
            raise NotImplementedError("New relative imports.")
        if filename is not None:
            # Try resolving relative to the importing file's package first.
            relative = '.'.join(self.package(filename) + module.split('.'))
            pxd = self.context.find_pxd_file(relative, None)
            if pxd:
                return pxd
        return self.context.find_pxd_file(module, None)

    @cached_method
    def cimported_files(self, filename):
        """Return the list of .pxd files filename directly depends on."""
        if filename[-4:] == '.pyx' and os.path.exists(filename[:-4] + '.pxd'):
            self_pxd = [filename[:-4] + '.pxd']
        else:
            self_pxd = []
        a = self.immediate_dependencies(filename)[0]
        # Compute once (the original re-ran find_pxd over the whole list a
        # second time for the return value).  List comprehension instead of
        # filter(None, ...) — same value, and not an iterator on Py3.
        b = [pxd for pxd in [self.find_pxd(m, filename) for m in a] if pxd]
        if len(a) != len(b):
            # Debug aid: some cimported module had no resolvable .pxd file.
            print (filename)
            print ("\n\t".join(a))
            print ("\n\t".join(b))
        return self_pxd + b

    @cached_method
    def timestamp(self, filename):
        return os.path.getmtime(filename)

    def extract_timestamp(self, filename):
        # (mtime, path) tuples compare by time first, so max() over these
        # picks the most recently modified dependency.
        return self.timestamp(filename), filename

    def newest_dependency(self, filename):
        """Return (mtime, path) of the newest transitive dependency."""
        return self.transitive_merge(filename, self.extract_timestamp, max)

    def transitive_merge(self, node, extract, merge):
        """
        Fold merge() over extract() of node and all of its transitive
        dependencies, caching per (extract, merge) pair and tolerating
        cycles in the dependency graph.
        """
        try:
            seen = self._transitive_cache[extract, merge]
        except KeyError:
            seen = self._transitive_cache[extract, merge] = {}
        return self.transitive_merge_helper(
            node, extract, merge, seen, {}, self.cimported_files)[0]

    def transitive_merge_helper(self, node, extract, merge, seen, stack, outgoing):
        # Depth-first traversal.  `stack` maps in-progress nodes to their
        # depth; results for nodes inside an unfinished cycle must not be
        # cached in `seen`, which is what the `loop` bookkeeping tracks.
        if node in seen:
            return seen[node], None
        deps = extract(node)
        if node in stack:
            # Back edge: report this node as the cycle head to the caller.
            return deps, node
        try:
            stack[node] = len(stack)
            loop = None
            for next in outgoing(node):
                sub_deps, sub_loop = self.transitive_merge_helper(
                    next, extract, merge, seen, stack, outgoing)
                if sub_loop is not None:
                    # Keep the shallowest cycle head seen so far.
                    if loop is not None and stack[loop] < stack[sub_loop]:
                        pass
                    else:
                        loop = sub_loop
                deps = merge(deps, sub_deps)
            if loop == node:
                loop = None   # the cycle closed here; result is complete
            if loop is None:
                seen[node] = deps
            return deps, loop
        finally:
            del stack[node]
......@@ -583,88 +583,6 @@ def run_pipeline(source, options, full_module_name = None):
err, enddata = context.run_pipeline(pipeline, source)
context.teardown_errors(err, options, result)
return result
def parse_dependancies(source):
    """
    Scanner-based dependency extraction (Python 2).

    Tokenizes `source` with PyrexScanner and collects the modules it
    cimports, the files it includes, and the headers named in
    'cdef extern from' blocks, printing the three lists when done.
    """
    # Set up context
    options = CompilationOptions()
    context = Context(options.include_path, options.compiler_directives,
                      options.cplus, options.language_level)
    # Set up source object
    cwd = os.getcwd()
    source_desc = FileSourceDescriptor(os.path.join(cwd, source))
    full_module_name = context.extract_module_name(source, options)
    source = CompilationSource(source_desc, full_module_name, cwd)
    scope = ModuleScope(name = full_module_name, parent_module = None, context = context)
    f = Utils.open_source_file(source_desc.filename, "rU")
    s = PyrexScanner(f, source_desc, source_encoding = f.encoding,
                     scope = scope, context = context)
    from Parsing import p_string_literal
    def p_all(s):
        # Dispatch on the current token until EOF; only dependency-bearing
        # statements get parsed in detail, everything else is skipped.
        while s.sy != 'EOF':
            if s.sy == 'cimport':
                p_cimport(s)
            elif s.sy == 'from':
                p_from(s)
            elif s.sy == 'include':
                p_include(s)
            elif s.sy == 'cdef':
                p_cdef_extern(s)
            else:
                p_other(s)
    def p_include(s):
        # include 'filename'
        s.next()
        _, include_file = p_string_literal(s)
        includes.append(include_file)
    def p_dotted_name(s):
        # a.b.c  ->  'a.b.c'
        path = [s.systring]
        s.next()
        while s.sy == '.':
            s.next()
            path.append(s.systring)
            s.next()
        return '.'.join(path)
    def p_cimport(s):
        # cimport module[.submodule]
        # TODO: newstyle relative imports
        s.next()
        cimports.append(p_dotted_name(s))
    def p_from(s):
        # from module cimport ...; a plain 'from ... import' is ignored.
        s.next()
        module = p_dotted_name(s)
        if s.systring == 'cimport':
            cimports.append(module)
        p_other(s)
    def p_cdef_extern(s):
        # cdef extern from 'header'; a bare '*' source records nothing.
        s.next()
        if s.systring != 'extern': return
        s.next()
        if s.systring != 'from': return
        s.next()
        if s.systring == '*': return
        _, include_file = p_string_literal(s)
        externs.append(include_file)
    def p_other(s):
        # Skip the remainder of the current logical line.
        while s.sy not in ('NEWLINE', 'EOF'):
            s.next()
        if s.sy == 'NEWLINE':
            s.next()
    cimports = []
    includes = []
    externs = []
    p_all(s)
    print cimports, includes, externs
#------------------------------------------------------------------------
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment