strconv: Switch _utf8_decode_rune to return rune ordinal instead of unicode character

This is a preparatory step for the next patch, where we'll be fixing strconv for Python2 builds with --enable-unicode=ucs2, in which a single unicode character can take _2_ code units (a surrogate pair). In that general case relying on unicode objects to represent runes is not good, because many things generally do not work for U+10000 and above, e.g. ord breaks:

    >>> import sys
    >>> sys.maxunicode
    65535           <-- NOTE indicates UCS2 build
    >>> s = u'\U00012345'
    >>> s
    u'\U00012345'
    >>> s.encode('utf-8')
    '\xf0\x92\x8d\x85'
    >>> len(s)
    2               <-- NOTE _not_ 1
    >>> ord(s)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: ord() expected a character, but string of length 2 found

so we switch to representing runes as integers, similarly to what Go does.

strconv: Switch _utf8_decode_rune to return rune ordinal instead of unicode character
This is a preparatory step for the next patch, where we'll be fixing strconv for Python2 builds with --enable-unicode=ucs2, in which a single unicode character can take _2_ code units (a surrogate pair). In that general case relying on unicode objects to represent runes is not good, because many things generally do not work for U+10000 and above, e.g. ord breaks:

    >>> import sys
    >>> sys.maxunicode
    65535           <-- NOTE indicates UCS2 build
    >>> s = u'\U00012345'
    >>> s
    u'\U00012345'
    >>> s.encode('utf-8')
    '\xf0\x92\x8d\x85'
    >>> len(s)
    2               <-- NOTE _not_ 1
    >>> ord(s)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
    TypeError: ord() expected a character, but string of length 2 found

so we switch to representing runes as integers, similarly to what Go does.
5cc679ac · Kirill Smelkov · cd67996e · 5cc679ac
Commit 5cc679ac authored Feb 28, 2020 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 8 deletions

golang/strconv.py golang/strconv.py +8 -8

No files found.
--- a/golang/strconv.py
+++ b/golang/strconv.py
@@ -120,7 +120,7 @@ def _quote(s):
                emit(br'\x%02x' % ord(c))

            # printable utf-8 characters go as is
-            elif unicodedata.category(r)[0] in _printable_cat0:
+            elif unicodedata.category(unichr(r))[0] in _printable_cat0:
                emit(s[i:isize])

            # everything else goes in numeric byte escapes
@@ -170,12 +170,12 @@ def _unquote_next(s):
        if width == 0:
            raise ValueError('no closing "')

-        if r == u'"':
+        if r == ord('"'):
            s = s[1:]
            break

        # regular UTF-8 character
-        if r != u'\\':
+        if r != ord('\\'):
            emit(s[:width])
            s = s[width:]
            continue
@@ -222,7 +222,7 @@ def _unquote_next(s):

 _printable_cat0 = frozenset(['L', 'N', 'P', 'S'])   # letters, numbers, punctuation, symbols

-_rune_error = u'\uFFFD' # unicode replacement character
+_rune_error = 0xFFFD # unicode replacement character

 # _utf8_decode_rune decodes next UTF8-character from byte string s.
 #
@@ -231,7 +231,7 @@ def _utf8_decode_rune(s):
    assert isinstance(s, bytes)

    if len(s) == 0:
-        return '', 0
+        return _rune_error, 0

    l = min(len(s), 4)  # max size of an UTF-8 encoded character
    while l > 0:
@@ -242,7 +242,7 @@ def _utf8_decode_rune(s):
            continue

        if len(r) == 1:
-            return r, l
+            return ord(r), l

        l -= 1
        continue
@@ -268,7 +268,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
        # surrogates are not valid UTF-8:
        # https://github.com/python/cpython/blob/v3.8.1-118-gdbb37aac142/Objects/stringlib/codecs.h#L153-L157
        # (python3 raises UnicodeDecodeError for surrogates)
-        elif 0xd800 <= ord(r) < 0xdfff:
+        elif 0xd800 <= r < 0xdfff:
            for c in s[:width]:
                b = ord(c)
                if c >= 0x80:
@@ -277,7 +277,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
                    emit(unichr(b))

        else:
-            emit(r)
+            emit(unichr(r))

        s = s[width:]