Commit 0561926a authored by Kirill Smelkov's avatar Kirill Smelkov

strconv: Fix b & friends on macos/windows

On macos and windows, Python2 is built with --enable-unicode=ucs2, which
makes it to use UTF-16 encoding for unicode characters, and so for
characters higher than U+10000 it uses surrogate encoding with _2_
unicode points, for example:

        >>> import sys
        >>> sys.maxunicode
        65535                       <-- NOTE indicates UCS2 build
        >>> s = u'\U00012345'
        >>> s
        u'\U00012345'
        >>> s.encode('utf-8')
        '\xf0\x92\x8d\x85'
        >>> len(s)
        2                           <-- NOTE _not_ 1
        >>> s[0]
        u'\ud808'
        >>> s[1]
        u'\udf45'

This leads to e.g. b tests failing for

    # tbytes                        tunicode
    (b"\xf0\x90\x8c\xbc",           u'\U0001033c'),     # Valid 4 Octet Sequence '𐌼'

    >           assert b(tunicode) == tbytes
    E           AssertionError: assert '\xed\xa0\x80\xed\xbc\xbc' == '\xf0\x90\x8c\xbc'
    E             - \xed\xa0\x80\xed\xbc\xbc
    E             + \xf0\x90\x8c\xbc

because on UCS2 python build u'\U0001033c' is represented as 2 unicode
points:

    >>> s = u'\U0001033c'
    >>> len(s)
    2
    >>> s[0]
    u'\ud800'
    >>> s[1]
    u'\udf3c'
    >>> s[0].encode('utf-8')
    '\xed\xa0\x80'
    >>> s[1].encode('utf-8')
    '\xed\xbc\xbc'

-> Fix it by detecting UCS2 build and working around by manually
combining such surrogate unicode pairs appropriately.

A reference on the subject:

https://matthew-brett.github.io/pydagogue/python_unicode.html#utf-16-ucs2-builds-of-python-and-32-bit-unicode-code-points
parent 5cc679ac
......@@ -1601,6 +1601,10 @@ def test_strings():
# non-printable utf-8
(b'\x7f\xc2\x80\xc2\x81\xc2\x82\xc2\x83\xc2\x84\xc2\x85\xc2\x86\xc2\x87',
u"\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"),
# some characters with U >= 0x10000
(b'\xf0\x9f\x99\x8f', u'\U0001f64f'), # 🙏
(b'\xf0\x9f\x9a\x80', u'\U0001f680'), # 🚀
)
for tbytes, tunicode in testv:
......
......@@ -21,6 +21,7 @@
from __future__ import print_function, absolute_import
import sys
import six, unicodedata, codecs
from six import text_type as unicode # py2: unicode py3: str
from six import unichr # py2: unichr py3: chr
......@@ -120,7 +121,7 @@ def _quote(s):
emit(br'\x%02x' % ord(c))
# printable utf-8 characters go as is
elif unicodedata.category(unichr(r))[0] in _printable_cat0:
elif unicodedata.category(_xunichr(r))[0] in _printable_cat0:
emit(s[i:isize])
# everything else goes in numeric byte escapes
......@@ -224,6 +225,9 @@ _printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuat
_rune_error = 0xFFFD # unicode replacement character
_ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
# _utf8_decode_rune decodes next UTF8-character from byte string s.
#
# _utf8_decode_rune(s) -> (r, size)
......@@ -244,6 +248,15 @@ def _utf8_decode_rune(s):
if len(r) == 1:
return ord(r), l
# see comment in _utf8_encode_surrogateescape
if _ucs2_build and len(r) == 2:
try:
return _xuniord(r), l
# e.g. TypeError: ord() expected a character, but string of length 2 found
except TypeError:
l -= 1
continue
l -= 1
continue
......@@ -277,7 +290,7 @@ def _utf8_decode_surrogateescape(s): # -> unicode
emit(unichr(b))
else:
emit(unichr(r))
emit(_xunichr(r))
s = s[width:]
......@@ -290,12 +303,74 @@ def _utf8_encode_surrogateescape(s): # -> bytes
outv = []
emit = outv.append
for uc in s:
while len(s) > 0:
uc = s[0]; s = s[1:]
c = ord(uc)
if 0xdc80 <= c <= 0xdcff:
# surrogate - emit unescaped byte
emit(bchr(c & 0xff))
else:
emit(uc.encode('utf-8', 'strict'))
continue
# in builds with --enable-unicode=ucs2 (default for py2 on macos and windows)
# python represents unicode points > 0xffff as _two_ unicode characters:
#
# uh = u - 0x10000
# c1 = 0xd800 + (uh >> 10) ; [d800, dbff]
# c2 = 0xdc00 + (uh & 0x3ff) ; [dc00, dfff]
#
# if detected - merge those two unicode characters for .encode('utf-8') below
#
# this should be only relevant for python2, as python3 switched to "flexible"
# internal unicode representation: https://www.python.org/dev/peps/pep-0393
if _ucs2_build and (0xd800 <= c <= 0xdbff):
if len(s) > 0:
uc2 = s[0]
c2 = ord(uc2)
if 0xdc00 <= c2 <= 0xdfff:
uc = uc + uc2
s = s[1:]
emit(uc.encode('utf-8', 'strict'))
return b''.join(outv)
# _xuniord returns ordinal for a unicode character u.
#
# it works correctly even if u is represented as 2 unicode surrogate points on
# ucs2 python build.
if not _ucs2_build:
_xuniord = ord
else:
def _xuniord(u):
assert isinstance(u, unicode)
if len(u) == 1:
return ord(u)
# see _utf8_encode_surrogateescape for details
if len(u) == 2:
c1 = ord(u[0])
c2 = ord(u[1])
if (0xd800 <= c1 <= 0xdbff) and (0xdc00 <= c2 <= 0xdfff):
return 0x10000 | ((c1 - 0xd800) << 10) | (c2 - 0xdc00)
# let it crash
return ord(u)
# _xunichr returns unicode character for an ordinal i.
#
# it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are
# represented as 2 unicode pointe.
if not _ucs2_build:
_xunichr = unichr
else:
def _xunichr(i):
if i < 0x10000:
return unichr(i)
# see _utf8_encode_surrogateescape for details
uh = i - 0x10000
return unichr(0xd800 + (uh >> 10)) + \
unichr(0xdc00 + (uh & 0x3ff))
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment