Commit 5cc679ac authored by Kirill Smelkov

strconv: Switch _utf8_decode_rune to return rune ordinal instead of unicode character

This is a preparatory step for the next patch, where we'll be fixing
strconv for Python2 builds with --enable-unicode=ucs2, where a single
unicode character can occupy _2_ code units (a surrogate pair).

In that general case, relying on unicode objects to represent runes is
not good, because many operations do not work for characters U+10000 and
above, e.g. ord breaks:

    >>> import sys
    >>> sys.maxunicode
    65535                       <-- NOTE indicates UCS2 build
    >>> s = u'\U00012345'
    >>> s
    u'\U00012345'
    >>> s.encode('utf-8')
    '\xf0\x92\x8d\x85'
    >>> len(s)
    2                           <-- NOTE _not_ 1
    >>> ord(s)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: ord() expected a character, but string of length 2 found

so we switch to representing runes as integers, similarly to what Go does.
parent cd67996e
......@@ -120,7 +120,7 @@ def _quote(s):
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
elif unicodedata.category(r)[0] in _printable_cat0:
elif unicodedata.category(unichr(r))[0] in _printable_cat0:
emit(s[i:isize])
# everything else goes in numeric byte escapes
......@@ -170,12 +170,12 @@ def _unquote_next(s):
if width == 0:
raise ValueError('no closing "')
if r == u'"':
if r == ord('"'):
s = s[1:]
break
# regular UTF-8 character
if r != u'\\':
if r != ord('\\'):
emit(s[:width])
s = s[width:]
continue
......@@ -222,7 +222,7 @@ def _unquote_next(s):
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character
_rune_error = 0xFFFD # unicode replacement character
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
......@@ -231,7 +231,7 @@ def _utf8_decode_rune(s):
assert isinstance(s, bytes)
if len(s) == 0:
return '', 0
return _rune_error, 0
l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
......@@ -242,7 +242,7 @@ def _utf8_decode_rune(s):
continue
if len(r) == 1:
return r, l
return ord(r), l
l -= 1
continue
......@@ -268,7 +268,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
# surrogates are not valid UTF-8:
# https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
# (python3 raises UnicodeDecodeError for surrogates)
elif 0xd800 <= ord(r) < 0xdfff:
elif 0xd800 <= r < 0xdfff:
for c in s[:width]:
b = ord(c)
if c >= 0x80:
......@@ -277,7 +277,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
emit(unichr(b))
else:
emit(r)
emit(unichr(r))
s = s[width:]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment