Commit 85486ea2 authored by Nikita Nemkin

Full support for Py_UNICODE[] literals with non-BMP characters.

parent 3ce78016
...@@ -778,15 +778,6 @@ class StringConst(object): ...@@ -778,15 +778,6 @@ class StringConst(object):
self.py_strings[key] = py_string self.py_strings[key] = py_string
return py_string return py_string
class UnicodeConst(object):
"""Global info about a Py_UNICODE[] constant held by GlobalState.
"""
# cname string
# text EncodedString (unicode)
def __init__(self, cname, text):
self.cname = cname
self.text = text
class PyStringConst(object): class PyStringConst(object):
"""Global info about a Python string constant held by GlobalState. """Global info about a Python string constant held by GlobalState.
...@@ -1033,8 +1024,7 @@ class GlobalState(object): ...@@ -1033,8 +1024,7 @@ class GlobalState(object):
try: try:
c = self.unicode_const_index[text] c = self.unicode_const_index[text]
except KeyError: except KeyError:
c = UnicodeConst(self.new_const_cname(), text) c = self.unicode_const_index[text] = self.new_const_cname()
self.unicode_const_index[text] = c
return c return c
def get_py_string_const(self, text, identifier=None, def get_py_string_const(self, text, identifier=None,
...@@ -1162,9 +1152,16 @@ class GlobalState(object): ...@@ -1162,9 +1152,16 @@ class GlobalState(object):
for py_string in c.py_strings.values(): for py_string in c.py_strings.values():
py_strings.append((c.cname, len(py_string.cname), py_string)) py_strings.append((c.cname, len(py_string.cname), py_string))
for c in self.unicode_const_index.values(): for c, cname in self.unicode_const_index.items():
decls_writer.putln('static Py_UNICODE %s[] = { %s };' % ( utf16_array, utf32_array = StringEncoding.encode_py_unicode_string(c)
c.cname, StringEncoding.encode_py_unicode_string(c.text))) if utf16_array:
# Narrow and wide representations differ
decls_writer.putln("#if Py_UNICODE_WIDE")
decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf32_array))
if utf16_array:
decls_writer.putln("#else")
decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf16_array))
decls_writer.putln("#endif")
if py_strings: if py_strings:
self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c")) self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c"))
...@@ -1461,7 +1458,7 @@ class CCodeWriter(object): ...@@ -1461,7 +1458,7 @@ class CCodeWriter(object):
return self.globalstate.get_string_const(text).cname return self.globalstate.get_string_const(text).cname
def get_unicode_const(self, text): def get_unicode_const(self, text):
return self.globalstate.get_unicode_const(text).cname return self.globalstate.get_unicode_const(text)
def get_py_string_const(self, text, identifier=None, def get_py_string_const(self, text, identifier=None,
is_str=False, unicode_value=None): is_str=False, unicode_value=None):
......
...@@ -1242,8 +1242,6 @@ class UnicodeNode(ConstNode): ...@@ -1242,8 +1242,6 @@ class UnicodeNode(ConstNode):
if self.type.is_pyobject: if self.type.is_pyobject:
self.result_code = code.get_py_string_const(self.value) self.result_code = code.get_py_string_const(self.value)
else: else:
if self.contains_surrogates():
warning(self.pos, "Py_UNICODE* literals with characters outside BMP are not portable.", level=1);
self.result_code = code.get_unicode_const(self.value) self.result_code = code.get_unicode_const(self.value)
def calculate_result_code(self): def calculate_result_code(self):
......
...@@ -267,10 +267,18 @@ def split_string_literal(s, limit=2000): ...@@ -267,10 +267,18 @@ def split_string_literal(s, limit=2000):
def encode_py_unicode_string(s): def encode_py_unicode_string(s):
"""Create Py_UNICODE[] representation of a given unicode string. """Create Py_UNICODE[] representation of a given unicode string.
""" """
# Non-BMP characters will appear as surrogates, which is not compatible with utf32_array = array.array('i', s.encode('UTF-32'))
# wide (UTF-32) Python builds. UnicodeNode will warn the user about this. assert utf32_array.itemsize == 4
utf32_array.pop(0) # Remove BOM
utf32_array.append(0) # Add NULL terminator
for c in utf32_array:
if c > 65535:
utf16_array = array.array('H', s.encode('UTF-16'))
utf16_array.pop(0) # Remove BOM
utf16_array.append(0) # Add NULL terminator
break
else:
utf16_array = []
a = array.array('H', s.encode('UTF-16')) return ",".join(map(unicode, utf16_array)), ",".join(map(unicode, utf32_array))
a.pop(0) # Remove BOM
a.append(0) # Add NULL terminator
return u",".join(map(unicode, a))
...@@ -19,6 +19,8 @@ cdef Py_UNICODE c_pu_arr[42] ...@@ -19,6 +19,8 @@ cdef Py_UNICODE c_pu_arr[42]
cdef LPWSTR c_wstr = u"unicode\u1234" cdef LPWSTR c_wstr = u"unicode\u1234"
cdef Py_UNICODE* c_pu_empty = u"" cdef Py_UNICODE* c_pu_empty = u""
cdef char* c_empty = "" cdef char* c_empty = ""
cdef unicode uwide_literal = u'\U00020000\U00020001'
cdef Py_UNICODE* c_pu_wide_literal = u'\U00020000\U00020001'
memcpy(c_pu_arr, c_pu_str, sizeof(Py_UNICODE) * (len(uobj) + 1)) memcpy(c_pu_arr, c_pu_str, sizeof(Py_UNICODE) * (len(uobj) + 1))
...@@ -55,6 +57,12 @@ def test_c_to_python(): ...@@ -55,6 +57,12 @@ def test_c_to_python():
assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42 assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42
assert sizeof(c_pu_str) == sizeof(void*) assert sizeof(c_pu_str) == sizeof(void*)
assert c_pu_wide_literal == uwide_literal
if sizeof(Py_UNICODE) >= 4:
assert len(c_pu_wide_literal) == 2
else:
assert len(c_pu_wide_literal) == 4
assert u'unicode' assert u'unicode'
assert not u'' assert not u''
assert c_pu_str assert c_pu_str
...@@ -80,5 +88,7 @@ def test_python_to_c(): ...@@ -80,5 +88,7 @@ def test_python_to_c():
u = uobj[1] u = uobj[1]
assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u) assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u)
assert Py_UNICODE_equal(uwide_literal, <Py_UNICODE*>c_pu_wide_literal)
assert len(u"abc\0") == 4 assert len(u"abc\0") == 4
assert len(<Py_UNICODE*>u"abc\0") == 3 assert len(<Py_UNICODE*>u"abc\0") == 3
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment