Commit f42dc448 authored by Victor Stinner

PyUnicode_CopyCharacters() fails when copying latin1 into ascii

parent c53be96c
...@@ -455,6 +455,46 @@ _PyUnicode_New(Py_ssize_t length) ...@@ -455,6 +455,46 @@ _PyUnicode_New(Py_ssize_t length)
return NULL; return NULL;
} }
/* Return a constant, human-readable label describing how a unicode object
   is stored. The label is used to build the ValueError message raised by
   PyUnicode_CopyCharacters() when the source characters do not fit into
   the destination string.

   The result is always a string literal; the caller must not free it.

   Possible results:
     - "wstr" for a non-compact string that is not ready;
     - "legacy ascii" / "legacy latin1" / "legacy UCS2" / "legacy UCS4"
       for a ready, non-compact string;
     - "ascii" / "compact latin1" / "compact UCS2" / "compact UCS4"
       for a compact string;
     - "<legacy invalid kind>" / "<invalid compact kind>" when the kind is
       none of the three known kinds.

   NOTE(review): the 1-byte legacy case tests PyUnicode_IS_COMPACT_ASCII()
   on an object already known to be non-compact. Whether that macro can be
   true there depends on its definition, which is not visible here; if it
   cannot, "legacy ascii" is never returned -- confirm. */
static const char*
unicode_kind_name(PyObject *unicode)
{
    int is_legacy;

    assert(PyUnicode_Check(unicode));
    is_legacy = !PyUnicode_IS_COMPACT(unicode);

    if (is_legacy) {
        /* A non-compact string that is not ready has no kind to report. */
        if (!PyUnicode_IS_READY(unicode))
            return "wstr";
    }
    else {
        assert(PyUnicode_IS_READY(unicode));
    }

    /* Both layouts share the same kind dispatch; only the label differs. */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_COMPACT_ASCII(unicode))
            return is_legacy ? "legacy ascii" : "ascii";
        return is_legacy ? "legacy latin1" : "compact latin1";
    case PyUnicode_2BYTE_KIND:
        return is_legacy ? "legacy UCS2" : "compact UCS2";
    case PyUnicode_4BYTE_KIND:
        return is_legacy ? "legacy UCS4" : "compact UCS4";
    default:
        return is_legacy ? "<legacy invalid kind>" : "<invalid compact kind>";
    }
}
#ifdef Py_DEBUG #ifdef Py_DEBUG
int unicode_new_new_calls = 0; int unicode_new_new_calls = 0;
...@@ -672,8 +712,10 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -672,8 +712,10 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
to_kind = PyUnicode_KIND(to); to_kind = PyUnicode_KIND(to);
to_data = PyUnicode_DATA(to); to_data = PyUnicode_DATA(to);
if (from_kind == to_kind) { if (from_kind == to_kind
/* fast path */ /* deny latin1 => ascii */
&& PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
{
Py_MEMCPY((char*)to_data Py_MEMCPY((char*)to_data
+ PyUnicode_KIND_SIZE(to_kind, to_start), + PyUnicode_KIND_SIZE(to_kind, to_start),
(char*)from_data (char*)from_data
...@@ -712,7 +754,14 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -712,7 +754,14 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
} }
else { else {
int invalid_kinds; int invalid_kinds;
if (from_kind > to_kind) {
/* check if max_char(from substring) <= max_char(to) */
if (from_kind > to_kind
/* latin1 => ascii */
|| (PyUnicode_IS_COMPACT_ASCII(to)
&& to_kind == PyUnicode_1BYTE_KIND
&& !PyUnicode_IS_COMPACT_ASCII(from)))
{
/* slow path to check for character overflow */ /* slow path to check for character overflow */
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Py_UCS4 ch, maxchar; Py_UCS4 ch, maxchar;
...@@ -736,10 +785,10 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -736,10 +785,10 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
invalid_kinds = 1; invalid_kinds = 1;
if (invalid_kinds) { if (invalid_kinds) {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Cannot copy UCS%u characters " "Cannot copy %s characters "
"into a string of UCS%u characters", "into a string of %s characters",
1 << (from_kind - 1), unicode_kind_name(from),
1 << (to_kind -1)); unicode_kind_name(to));
return -1; return -1;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment