Commit 13eb4498 authored by Stefan Behnel

Using Py_UNICODE to store lone surrogates makes Py3 join surrogate pairs on...

Storing lone surrogates as Py_UNICODE data makes Py3 join surrogate pairs on 16-bit Unicode platforms (Windows) when it reads them back in, even though we had processed them correctly before.
Instead, we now store the string value as a byte string encoded with the "unicode_escape" codec and decode it at runtime, because that codec can return lone surrogate characters (which the other codecs cannot).
parent f17dc17f
......@@ -1632,13 +1632,14 @@ class UnicodeNode(ConstNode):
# lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3
self.result_code = code.get_py_const(py_object_type, 'ustring')
data_cname = code.get_pyunicode_ptr_const(self.value)
data_cname = code.get_string_const(
StringEncoding.BytesLiteral(self.value.encode('unicode_escape')))
const_code = code.get_cached_constants_writer(self.result_code)
if const_code is None:
return # already initialised
const_code.mark_pos(self.pos)
const_code.putln(
"%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
"%s = PyUnicode_DecodeUnicodeEscape(%s, sizeof(%s) - 1, NULL); %s" % (
self.result_code,
data_cname,
data_cname,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment