Commit 21013488 authored by Tim Peters

Fiddled w/ /F's cool new splitbins function: documented it, generalized it
a bit, sped it a lot primarily by removing the unused assumption that None was
a legit bin entry (the function doesn't really need to assume that there's
anything special about 0), added an optional "trace" argument, and in __debug__
mode added exhaustive verification that the decomposition is both correct and
doesn't overstep any array bounds (which wasn't obvious to me from staring at the
generated C code -- now I feel safe!). Did not commit a new unicodedata_db.h, as
the one produced by this version is identical to the one already checked in.
parent 68ded6e6
@@ -165,38 +165,66 @@ def getsize(data):
     else:
         return 4
 
-def splitbins(bins):
-    # split a sparse integer table into two tables, such as:
-    # value = t2[(t1[char>>shift]<<shift)+(char&mask)]
-    # and value == 0 means no data
-    bytes = sys.maxint
-    for shift in range(16):
-        bin1 = []
-        bin2 = []
+def splitbins(t, trace=0):
+    """t, trace=0 -> (t1, t2, shift). Split a table to save space.
+
+    t is a sequence of ints. This function can be useful to save space if
+    many of the ints are the same. t1 and t2 are lists of ints, and shift
+    is an int, chosen to minimize the combined size of t1 and t2 (in C
+    code), and where for each i in range(len(t)),
+        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+    where mask is a bitmask isolating the last "shift" bits.
+
+    If optional arg trace is true (default false), progress info is
+    printed to sys.stderr.
+    """
+
+    import sys
+    if trace:
+        def dump(t1, t2, shift, bytes):
+            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
+                len(t1), len(t2), shift, bytes)
+        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
+                            "bytes"
+    n = len(t)-1 # last valid index
+    maxshift = 0 # the most we can shift n and still have something left
+    if n > 0:
+        while n >> 1:
+            n >>= 1
+            maxshift += 1
+    del n
+    bytes = sys.maxint # smallest total size so far
+    t = tuple(t) # so slices can be dict keys
+    for shift in range(maxshift + 1):
+        t1 = []
+        t2 = []
         size = 2**shift
         bincache = {}
-        for i in range(0, len(bins), size):
-            bin = bins[i:i+size]
-            index = bincache.get(tuple(bin))
+        for i in range(0, len(t), size):
+            bin = t[i:i+size]
+            index = bincache.get(bin)
             if index is None:
-                index = len(bin2)
-                bincache[tuple(bin)] = index
-                for v in bin:
-                    if v is None:
-                        bin2.append(0)
-                    else:
-                        bin2.append(v)
-            bin1.append(index>>shift)
+                index = len(t2)
+                bincache[bin] = index
+                t2.extend(bin)
+            t1.append(index >> shift)
         # determine memory size
-        b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
+        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
+        if trace:
+            dump(t1, t2, shift, b)
         if b < bytes:
-            best = shift, bin1, bin2
+            best = t1, t2, shift
             bytes = b
-    shift, bin1, bin2 = best
-## print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
-## len(bin1), len(bin2), shift, bytes
-## )
-    return bin1, bin2, shift
+    t1, t2, shift = best
+    if trace:
+        print >>sys.stderr, "Best:",
+        dump(t1, t2, shift, bytes)
+    if __debug__:
+        # exhaustively verify that the decomposition is correct
+        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
+        for i in xrange(len(t)):
+            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
+    return best
 
 if __name__ == "__main__":
     maketable()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment