Commit 5ee2de80 authored by Guido van Rossum's avatar Guido van Rossum

There's no point in encoding the number of continuation bytes in the

first byte -- we always find the end of a particular encoded number by
searching for the next byte with the high bit set.  This simplifies
the encoding and gives us more space for small encodings: 128 values
can now be encoded in 1 byte, and 16K in 2 bytes.
parent 61e89f2f
...@@ -3,45 +3,49 @@ ...@@ -3,45 +3,49 @@
# ordinary string .find() method can be used to find the encoded form of a # ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte # desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't # of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding. # be fooled into starting a match "in the middle" of an encoding. Unlike
# UTF-8, the initial byte does not tell you how many continuation bytes
# follow; and there's no ASCII superset property.
# Details: # Details:
# #
# + Only the first byte of an encoding has the sign bit set. # + Only the first byte of an encoding has the sign bit set.
# #
# + The number of bytes in the encoding is encoded in unary at the start of # + The first byte has 7 bits of data.
# the first byte (i.e., an encoding with n bytes begins with n 1-bits
# followed by a 0 bit).
# #
# + Bytes beyond the first in an encoding have the sign bit clear, followed # + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data. # by 7 bits of data.
# #
# + The number of data bits in the first byte of an encoding varies. # + The first byte doesn't tell you how many continuation bytes are
# following. You can tell by searching for the next byte with the
# high bit set (or the end of the string).
# #
# The int to be encoded can contain no more than 24 bits. # The int to be encoded can contain no more than 24 bits.
# XXX this could certainly be increased # XXX this could certainly be increased
# #
# If it contains no more than 6 bits, 00abcdef, the encoding is # If it contains no more than 7 bits, 0abcdefg, the encoding is
# 10abcdef # 1abcdefg
# #
# If it contains 7 thru 12 bits, # If it contains 8 thru 14 bits,
# 0000abcd efghijkL # 00abcdef ghijkLmn
# the encoding is # the encoding is
# 110abcde 0fghijkL # 1abcdefg 0hijkLmn
# #
# Static tables _encoding and _decoding capture all encodes and decodes for # Static tables _encoding and _decoding capture all encodes and decodes for
# 12 or fewer bits. # 14 or fewer bits.
# #
# If it contains 13 thru 18 bits, # If it contains 15 thru 21 bits,
# 000000ab cdefghij kLmnopqr # 000abcde fghijkLm nopqrstu
# the encoding is # the encoding is
# 1110abcd 0efghijk 0Lmnopqr # 1abcdefg 0hijkLmn 0opqrstu
# #
# If it contains 19 thru 24 bits, # If it contains 22 thru 28 bits,
# abcdefgh ijkLmnop qrstuvwx # 0000abcd efghijkL mnopqrst uvwxyzAB
# the encoding is # the encoding is
# 11110abc 0defghij 0kLmnopq 0rstuvwx # 1abcdefg 0hijkLmn 0opqrstu 0vwxyzAB
assert 0x80**2 == 0x4000
assert 0x80**4 == 0x10000000
import re import re
...@@ -51,18 +55,18 @@ def encode(wids): ...@@ -51,18 +55,18 @@ def encode(wids):
n = len(wid2enc) n = len(wid2enc)
return "".join([w < n and wid2enc[w] or _encode(w) for w in wids]) return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
_encoding = [None] * 0x1000 # Filled later, and converted to a tuple _encoding = [None] * 0x4000 # Filled later, and converted to a tuple
def _encode(w): def _encode(w):
assert 0x1000 <= w < 0x1000000 assert 0x4000 <= w < 0x10000000
b, c = divmod(w, 0x80) b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80) a, b = divmod(b, 0x80)
s = chr(b) + chr(c) s = chr(b) + chr(c)
if a < 0x10: # no more than 18 data bits if a < 0x80: # no more than 21 data bits
return chr(a + 0xE0) + s return chr(a + 0x80) + s
a, b = divmod(a, 0x80) a, b = divmod(a, 0x80)
assert a < 0x4, (w, a, b, s) # else more than 24 data bits assert a < 0x80, (w, a, b, s) # else more than 28 data bits
return (chr(a + 0xF0) + chr(b)) + s return (chr(a + 0x80) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*") _prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
...@@ -81,22 +85,22 @@ def _decode(s): ...@@ -81,22 +85,22 @@ def _decode(s):
return 0 return 0
if len(s) == 3: if len(s) == 3:
a, b, c = map(ord, s) a, b, c = map(ord, s)
assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80 assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80
return ((a & 0xF) << 14) | (b << 7) | c return ((a & 0x7F) << 14) | (b << 7) | c
assert len(s) == 4, `s` assert len(s) == 4, `s`
a, b, c, d = map(ord, s) a, b, c, d = map(ord, s)
assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80 assert a & 0x80 == 0x80 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d return ((a & 0x7F) << 21) | (b << 14) | (c << 7) | d
def _fill(): def _fill():
global _encoding global _encoding
for i in range(0x40): for i in range(0x80):
s = chr(i + 0x80) s = chr(i + 0x80)
_encoding[i] = s _encoding[i] = s
_decoding[s] = i _decoding[s] = i
for i in range(0x40, 0x1000): for i in range(0x80, 0x4000):
hi, lo = divmod(i, 0x80) hi, lo = divmod(i, 0x80)
s = chr(hi + 0xC0) + chr(lo) s = chr(hi + 0x80) + chr(lo)
_encoding[i] = s _encoding[i] = s
_decoding[s] = i _decoding[s] = i
_encoding = tuple(_encoding) _encoding = tuple(_encoding)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment