Commit c5694c8b authored by Marc-André Lemburg

Moved gencodec.py to the Tools/unicode/ directory.

Added new support for decoding tables.

Cleaned up the implementation a bit.
parent 31441302
......@@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).
Written by Marc-Andre Lemburg (mal@lemburg.com).
Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
Unicode table maps for decoding.
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.
(c) Copyright Marc-Andre Lemburg, 2005.
"""#"
import re,os,time,marshal
import re, os, time, marshal, codecs
# Create numeric tables or character based ones ?
numeric = 1
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192
# Standard undefined Unicode code point
UNI_UNDEFINED = unichr(0xFFFE)
mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
'\s+'
......@@ -69,8 +74,15 @@ def readmap(filename):
enc2uni = {}
identity = []
unmapped = range(256)
for i in range(256):
unmapped[i] = i
# UTC mapping tables per convention don't include the identity
# mappings for code points 0x00 - 0x1F and 0x7F, unless these are
# explicitly mapped to different characters or undefined
for i in range(32) + [127]:
identity.append(i)
unmapped.remove(i)
enc2uni[i] = (i, 'CONTROL CHARACTER')
for line in lines:
line = line.strip()
if not line or line[0] == '#':
......@@ -82,22 +94,23 @@ def readmap(filename):
enc,uni,comment = m.groups()
enc = parsecodes(enc)
uni = parsecodes(uni)
if not comment:
if comment is None:
comment = ''
else:
comment = comment[1:]
comment = comment[1:].strip()
if enc < 256:
if enc in unmapped:
unmapped.remove(enc)
if enc == uni:
identity.append(enc)
else:
enc2uni[enc] = (uni,comment)
else:
enc2uni[enc] = (uni,comment)
# If there are more identity-mapped entries than unmapped entries,
# it pays to generate an identity dictionary first, and add explicit
# mappings to None for the rest
if len(identity)>=len(unmapped):
if len(identity) >= len(unmapped):
for enc in unmapped:
enc2uni[enc] = (None, "")
enc2uni['IDENTITY'] = 256
......@@ -112,44 +125,146 @@ def hexrepr(t):
len(t)
except:
return '0x%04x' % t
try:
return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
except TypeError, why:
print '* failed to convert %r: %s' % (t, why)
raise
def unicoderepr(t):
def python_mapdef_code(varname, map, comments=1):
if t is None:
return 'None'
if numeric:
return hexrepr(t)
l = []
append = l.append
if map.has_key("IDENTITY"):
append("%s = codecs.make_identity_dict(range(%d))" %
(varname, map["IDENTITY"]))
append("%s.update({" % varname)
splits = 1
del map["IDENTITY"]
identity = 1
else:
try:
len(t)
except:
return repr(unichr(t))
return repr(''.join(map(unichr, t)))
append("%s = {" % varname)
splits = 0
identity = 0
mappings = map.items()
mappings.sort()
i = 0
for mapkey, mapvalue in mappings:
mapcomment = ''
if isinstance(mapkey, tuple):
(mapkey, mapcomment) = mapkey
if isinstance(mapvalue, tuple):
(mapvalue, mapcomment) = mapvalue
if mapkey is None:
continue
if (identity and
mapkey == mapvalue and
mapkey < 256):
# No need to include identity mappings, since these
# are already set for the first 256 code points.
continue
key = hexrepr(mapkey)
value = hexrepr(mapvalue)
if mapcomment and comments:
append(' %s: %s,\t# %s' % (key, value, mapcomment))
else:
append(' %s: %s,' % (key, value))
i += 1
if i == 4096:
# Split the definition into parts so that the Python
# parser doesn't dump core
if splits == 0:
append('}')
else:
append('})')
append('%s.update({' % varname)
i = 0
splits = splits + 1
if splits == 0:
append('}')
else:
append('})')
def keyrepr(t):
return l
if t is None:
return 'None'
if numeric:
return hexrepr(t)
def python_tabledef_code(varname, map, comments=1):
l = []
append = l.append
append('%s = (' % varname)
# Analyze map and create table dict
mappings = map.items()
mappings.sort()
table = {}
maxkey = 0
if map.has_key('IDENTITY'):
for key in range(256):
table[key] = (key, '')
maxkey = 255
del map['IDENTITY']
for mapkey, mapvalue in mappings:
mapcomment = ''
if isinstance(mapkey, tuple):
(mapkey, mapcomment) = mapkey
if isinstance(mapvalue, tuple):
(mapvalue, mapcomment) = mapvalue
if mapkey is None:
continue
table[mapkey] = (mapvalue, mapcomment)
if mapkey > maxkey:
maxkey = mapkey
if maxkey > MAX_TABLE_SIZE:
# Table too large
return None
# Create table code
for key in range(maxkey + 1):
if key not in table:
mapvalue = None
mapcomment = 'UNDEFINED'
else:
try:
len(t)
except:
if t < 256:
return repr(chr(t))
mapvalue, mapcomment = table[key]
if mapvalue is None:
mapchar = UNI_UNDEFINED
else:
if isinstance(mapvalue, tuple):
# 1-n mappings not supported
return None
else:
return repr(unichr(t))
return repr(''.join(map(chr, t)))
mapchar = unichr(mapvalue)
if mapcomment and comments:
append(' %r\t# %s -> %s' % (mapchar,
hexrepr(key),
mapcomment))
else:
append(' %r' % mapchar)
append(')')
return l
def codegen(name,map,comments=1):
def codegen(name, map, comments=1):
""" Returns Python source for the given map.
Comments are included in the source, if comments is true (default).
"""
# Generate code
decoding_map_code = python_mapdef_code(
'decoding_map',
map,
comments=comments)
decoding_table_code = python_tabledef_code(
'decoding_table',
map,
comments=comments)
encoding_map_code = python_mapdef_code(
'encoding_map',
codecs.make_encoding_map(map),
comments=comments)
l = [
'''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.
......@@ -167,9 +282,16 @@ class Codec(codecs.Codec):
return codecs.charmap_encode(input,errors,encoding_map)
def decode(self,input,errors='strict'):
''' % name
]
if decoding_table_code:
l.append('''\
return codecs.charmap_decode(input,errors,decoding_table)''')
else:
l.append('''\
return codecs.charmap_decode(input,errors,decoding_map)''')
return codecs.charmap_decode(input,errors,decoding_map)
l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
pass
......@@ -183,54 +305,21 @@ def getregentry():
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
### Decoding Map
''' % name,
]
''')
l.extend(decoding_map_code)
if map.has_key("IDENTITY"):
l.append("decoding_map = codecs.make_identity_dict(range(%d))"
% map["IDENTITY"])
l.append("decoding_map.update({")
splits = 1
del map["IDENTITY"]
else:
l.append("decoding_map = {")
splits = 0
# Add optional decoding table
if decoding_table_code:
l.append('''
### Decoding Table
''')
l.extend(decoding_table_code)
mappings = map.items()
mappings.sort()
append = l.append
i = 0
for e,value in mappings:
try:
(u,c) = value
except TypeError:
u = value
c = ''
key = keyrepr(e)
if c and comments:
append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
else:
append('\t%s: %s,' % (key,unicoderepr(u)))
i += 1
if i == 4096:
# Split the definition into parts so that the Python
# parser doesn't dump core
if splits == 0:
append('}')
else:
append('})')
append('decoding_map.update({')
i = 0
splits = splits + 1
if splits == 0:
append('}')
else:
append('})')
append('''
l.append('''
### Encoding Map
encoding_map = codecs.make_encoding_map(decoding_map)
''')
l.extend(encoding_map_code)
return '\n'.join(l)
def pymap(name,map,pyfile,comments=1):
......@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
mapnames = os.listdir(dir)
for mapname in mapnames:
mappathname = os.path.join(dir, mapname)
name = os.path.split(mapname)[1]
name = name.replace('-','_')
name = name.split('.')[0]
......@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
if not map:
print '* map is empty; skipping'
else:
pymap(mapname, map, prefix + codefile,comments)
marshalmap(mapname, map, prefix + marshalfile)
except ValueError:
print '* conversion failed'
pymap(mappathname, map, prefix + codefile,comments)
marshalmap(mappathname, map, prefix + marshalfile)
except ValueError, why:
print '* conversion failed: %s' % why
raise
def rewritepythondir(dir,prefix='',comments=1):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment