Moved gencodec.py to the Tools/unicode/ directory.

Added new support for decoding tables. Cleaned up the implementation a bit.

Moved gencodec.py to the Tools/unicode/ directory.
Added new support for decoding tables. Cleaned up the implementation a bit.
c5694c8b · Marc-André Lemburg · 31441302 · c5694c8b
Commit c5694c8b authored Oct 21, 2005 by Marc-André Lemburg
Show whitespace changes
Inline Side-by-side

Showing with 179 additions and 88 deletions

Tools/unicode/gencodec.py Tools/unicode/gencodec.py +179 -88

No files found.
--- a/Tools/scripts/gencodec.py
+++ b/Tools/scripts/gencodec.py
@@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
 The tool also writes marshalled versions of the mapping tables to the
 same location (with .mapping extension).

-Written by Marc-Andre Lemburg (mal@lemburg.com).
+Written by Marc-Andre Lemburg (mal@lemburg.com).  Modified to generate
+Unicode table maps for decoding.

 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 (c) Copyright Guido van Rossum, 2000.
+(c) Copyright Marc-Andre Lemburg, 2005.

 """#"

-import re,os,time,marshal
+import re, os, time, marshal, codecs

-# Create numeric tables or character based ones ?
-numeric = 1
+# Maximum allowed size of charmap tables
+MAX_TABLE_SIZE = 8192
+
+# Standard undefined Unicode code point
+UNI_UNDEFINED = unichr(0xFFFE)

 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
                   '\s+'
@@ -69,8 +74,15 @@ def readmap(filename):
    enc2uni = {}
    identity = []
    unmapped = range(256)
-    for i in range(256):
-        unmapped[i] = i
+
+    # UTC mapping tables per convention don't include the identity
+    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
+    # explicitly mapped to different characters or undefined
+    for i in range(32) + [127]:
+        identity.append(i)
+        unmapped.remove(i)
+        enc2uni[i] = (i, 'CONTROL CHARACTER')
+
    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
@@ -82,22 +94,23 @@ def readmap(filename):
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
-        if not comment:
+        if comment is None:
            comment = ''
        else:
-            comment = comment[1:]
+            comment = comment[1:].strip()
        if enc < 256:
+            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
-            else:
            enc2uni[enc] = (uni,comment)
        else:
            enc2uni[enc] = (uni,comment)
+
    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
-    if len(identity)>=len(unmapped):
+    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256
@@ -112,44 +125,146 @@ def hexrepr(t):
        len(t)
    except:
        return '0x%04x' % t
+    try:
        return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+    except TypeError, why:
+        print '* failed to convert %r: %s' % (t, why)
+        raise

-def unicoderepr(t):
+def python_mapdef_code(varname, map, comments=1):

-    if t is None:
-        return 'None'
-    if numeric:
-        return hexrepr(t)
+    l = []
+    append = l.append
+    if map.has_key("IDENTITY"):
+        append("%s = codecs.make_identity_dict(range(%d))" %
+               (varname, map["IDENTITY"]))
+        append("%s.update({" % varname)
+        splits = 1
+        del map["IDENTITY"]
+        identity = 1
    else:
-        try:
-            len(t)
-        except:
-            return repr(unichr(t))
-        return repr(''.join(map(unichr, t)))
+        append("%s = {" % varname)
+        splits = 0
+        identity = 0
+
+    mappings = map.items()
+    mappings.sort()
+    i = 0
+    for mapkey, mapvalue in mappings:
+        mapcomment = ''
+        if isinstance(mapkey, tuple):
+            (mapkey, mapcomment) = mapkey
+        if isinstance(mapvalue, tuple):
+            (mapvalue, mapcomment) = mapvalue
+        if mapkey is None:
+            continue
+        if (identity and
+            mapkey == mapvalue and
+            mapkey < 256):
+            # No need to include identity mappings, since these
+            # are already set for the first 256 code points.
+            continue
+        key = hexrepr(mapkey)
+        value = hexrepr(mapvalue)
+        if mapcomment and comments:
+            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
+        else:
+            append('    %s: %s,' % (key, value))
+        i += 1
+        if i == 4096:
+            # Split the definition into parts so that the Python
+            # parser doesn't dump core
+            if splits == 0:
+                append('}')
+            else:
+                append('})')
+            append('%s.update({' % varname)
+            i = 0
+            splits = splits + 1
+    if splits == 0:
+        append('}')
+    else:
+        append('})')

-def keyrepr(t):
+    return l

-    if t is None:
-        return 'None'
-    if numeric:
-        return hexrepr(t)
+def python_tabledef_code(varname, map, comments=1):
+
+    l = []
+    append = l.append
+    append('%s = (' % varname)
+
+    # Analyze map and create table dict
+    mappings = map.items()
+    mappings.sort()
+    table = {}
+    maxkey = 0
+    if map.has_key('IDENTITY'):
+        for key in range(256):
+            table[key] = (key, '')
+        maxkey = 255
+        del map['IDENTITY']
+    for mapkey, mapvalue in mappings:
+        mapcomment = ''
+        if isinstance(mapkey, tuple):
+            (mapkey, mapcomment) = mapkey
+        if isinstance(mapvalue, tuple):
+            (mapvalue, mapcomment) = mapvalue
+        if mapkey is None:
+            continue
+        table[mapkey] = (mapvalue, mapcomment)
+        if mapkey > maxkey:
+            maxkey = mapkey
+    if maxkey > MAX_TABLE_SIZE:
+        # Table too large
+        return None
+
+    # Create table code
+    for key in range(maxkey + 1):
+        if key not in table:
+            mapvalue = None
+            mapcomment = 'UNDEFINED'
        else:
-        try:
-            len(t)
-        except:
-            if t < 256:
-                return repr(chr(t))
+            mapvalue, mapcomment = table[key]
+        if mapvalue is None:
+            mapchar = UNI_UNDEFINED
+        else:
+            if isinstance(mapvalue, tuple):
+                # 1-n mappings not supported
+                return None
            else:
-                return repr(unichr(t))
-        return repr(''.join(map(chr, t)))
+                mapchar = unichr(mapvalue)
+        if mapcomment and comments:
+            append('    %r\t#  %s -> %s' % (mapchar,
+                                            hexrepr(key),
+                                            mapcomment))
+        else:
+            append('    %r' % mapchar)
+
+    append(')')
+    return l

-def codegen(name,map,comments=1):
+def codegen(name, map, comments=1):

    """ Returns Python source for the given map.

        Comments are included in the source, if comments is true (default).

    """
+    # Generate code
+    decoding_map_code = python_mapdef_code(
+        'decoding_map',
+        map,
+        comments=comments)
+    decoding_table_code = python_tabledef_code(
+        'decoding_table',
+        map,
+        comments=comments)
+    encoding_map_code = python_mapdef_code(
+        'encoding_map',
+        codecs.make_encoding_map(map),
+        comments=comments)
+
    l = [
        '''\
 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
@@ -167,9 +282,16 @@ class Codec(codecs.Codec):
        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):
+''' % name
+        ]
+    if decoding_table_code:
+        l.append('''\
+        return codecs.charmap_decode(input,errors,decoding_table)''')
+    else:
+        l.append('''\
+        return codecs.charmap_decode(input,errors,decoding_map)''')
         
-        return codecs.charmap_decode(input,errors,decoding_map)
-
+    l.append('''    
 class StreamWriter(Codec,codecs.StreamWriter):
    pass

@@ -183,54 +305,21 @@ def getregentry():
    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)

 ### Decoding Map
-''' % name,
-        ]
+''')
+    l.extend(decoding_map_code)

-    if map.has_key("IDENTITY"):
-        l.append("decoding_map = codecs.make_identity_dict(range(%d))"
-                 % map["IDENTITY"])
-        l.append("decoding_map.update({")
-        splits = 1
-        del map["IDENTITY"]
-    else:
-        l.append("decoding_map = {")
-        splits = 0
+    # Add optional decoding table
+    if decoding_table_code:
+        l.append('''
+### Decoding Table
+''')
+        l.extend(decoding_table_code)

-    mappings = map.items()
-    mappings.sort()
-    append = l.append
-    i = 0
-    for e,value in mappings:
-        try:
-            (u,c) = value
-        except TypeError:
-            u = value
-            c = ''
-        key = keyrepr(e)
-        if c and comments:
-            append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
-        else:
-            append('\t%s: %s,' % (key,unicoderepr(u)))
-        i += 1
-        if i == 4096:
-            # Split the definition into parts to that the Python
-            # parser doesn't dump core
-            if splits == 0:
-                append('}')
-            else:
-                append('})')
-            append('decoding_map.update({')
-            i = 0
-            splits = splits + 1
-    if splits == 0:
-        append('}')
-    else:
-        append('})')
-    append('''
+    l.append('''
 ### Encoding Map
-
-encoding_map = codecs.make_encoding_map(decoding_map)
 ''')
+    l.extend(encoding_map_code)
+    
    return '\n'.join(l)

 def pymap(name,map,pyfile,comments=1):
@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):

    mapnames = os.listdir(dir)
    for mapname in mapnames:
+        mappathname = os.path.join(dir, mapname)
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
            if not map:
                print '* map is empty; skipping'
            else:
-                pymap(mapname, map, prefix + codefile,comments)
-                marshalmap(mapname, map, prefix + marshalfile)
-        except ValueError:
-            print '* conversion failed'
+                pymap(mappathname, map, prefix + codefile,comments)
+                marshalmap(mappathname, map, prefix + marshalfile)
+        except ValueError, why:
+            print '* conversion failed: %s' % why
+            raise

 def rewritepythondir(dir,prefix='',comments=1):