Commit 5cc679ac authored by Kirill Smelkov

strconv: Switch _utf8_decode_rune to return rune ordinal instead of unicode character

This is a preparatory step for the next patch, where we'll be fixing
strconv for Python2 builds with --enable-unicode=ucs2, where a single
unicode character can occupy _2_ code units (a surrogate pair).

In that general case, relying on unicode objects to represent runes is
not good, because many things do not work for characters U+10000 and
above, e.g. ord breaks:

    >>> import sys
    >>> sys.maxunicode
    65535                       <-- NOTE indicates UCS2 build
    >>> s = u'\U00012345'
    >>> s
    u'\U00012345'
    >>> s.encode('utf-8')
    '\xf0\x92\x8d\x85'
    >>> len(s)
    2                           <-- NOTE _not_ 1
    >>> ord(s)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: ord() expected a character, but string of length 2 found

so we switch to representing runes as integers, similarly to what Go does.
parent cd67996e
...@@ -120,7 +120,7 @@ def _quote(s): ...@@ -120,7 +120,7 @@ def _quote(s):
emit(br'\x%02x' % ord(c)) emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is # printable utf-8 characters go as is
elif unicodedata.category(r)[0] in _printable_cat0: elif unicodedata.category(unichr(r))[0] in _printable_cat0:
emit(s[i:isize]) emit(s[i:isize])
# everything else goes in numeric byte escapes # everything else goes in numeric byte escapes
...@@ -170,12 +170,12 @@ def _unquote_next(s): ...@@ -170,12 +170,12 @@ def _unquote_next(s):
if width == 0: if width == 0:
raise ValueError('no closing "') raise ValueError('no closing "')
if r == u'"': if r == ord('"'):
s = s[1:] s = s[1:]
break break
# regular UTF-8 character # regular UTF-8 character
if r != u'\\': if r != ord('\\'):
emit(s[:width]) emit(s[:width])
s = s[width:] s = s[width:]
continue continue
...@@ -222,7 +222,7 @@ def _unquote_next(s): ...@@ -222,7 +222,7 @@ def _unquote_next(s):
_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols _printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols
_rune_error = u'\uFFFD' # unicode replacement character _rune_error = 0xFFFD # unicode replacement character
# _utf8_decode_rune decodes next UTF8-character from byte string s. # _utf8_decode_rune decodes next UTF8-character from byte string s.
# #
...@@ -231,7 +231,7 @@ def _utf8_decode_rune(s): ...@@ -231,7 +231,7 @@ def _utf8_decode_rune(s):
assert isinstance(s, bytes) assert isinstance(s, bytes)
if len(s) == 0: if len(s) == 0:
return '', 0 return _rune_error, 0
l = min(len(s), 4) # max size of an UTF-8 encoded character l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0: while l > 0:
...@@ -242,7 +242,7 @@ def _utf8_decode_rune(s): ...@@ -242,7 +242,7 @@ def _utf8_decode_rune(s):
continue continue
if len(r) == 1: if len(r) == 1:
return r, l return ord(r), l
l -= 1 l -= 1
continue continue
...@@ -268,7 +268,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode ...@@ -268,7 +268,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
# surrogates are not valid UTF-8: # surrogates are not valid UTF-8:
# https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157 # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
# (python3 raises UnicodeDecodeError for surrogates) # (python3 raises UnicodeDecodeError for surrogates)
elif 0xd800 <= ord(r) < 0xdfff: elif 0xd800 <= r < 0xdfff:
for c in s[:width]: for c in s[:width]:
b = ord(c) b = ord(c)
if c >= 0x80: if c >= 0x80:
...@@ -277,7 +277,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode ...@@ -277,7 +277,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
emit(unichr(b)) emit(unichr(b))
else: else:
emit(r) emit(unichr(r))
s = s[width:] s = s[width:]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment