Commit e6894c0f authored by Victor Stinner

PyUnicode_CopyCharacters() checks for buffer and character overflow

It now returns the number of written characters on success.
parent 9491968f
...@@ -519,10 +519,22 @@ PyAPI_FUNC(int) _PyUnicode_Ready( ...@@ -519,10 +519,22 @@ PyAPI_FUNC(int) _PyUnicode_Ready(
#endif #endif
/* Copy character from one unicode object into another, this function performs /* Copy character from one unicode object into another, this function performs
character conversion when nessesary and falls back to memcpy if possible. character conversion when necessary and falls back to memcpy if possible.
Return -1 and raise an exception on error, return 0 on success. */
Fail if 'to' is smaller than how_many or smaller than len(from)-from_start,
or if kind(from[from_start:from_start+how_many]) > kind(to).
Return the number of written characters, or return -1 and raise an exception
on error.
Pseudo-code:
how_many = min(how_many, len(from) - from_start)
to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
return how_many
*/
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_CopyCharacters( PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
PyObject *to, PyObject *to,
Py_ssize_t to_start, Py_ssize_t to_start,
PyObject *from, PyObject *from,
......
...@@ -606,13 +606,13 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, ...@@ -606,13 +606,13 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
} }
#endif #endif
int Py_ssize_t
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
PyObject *from, Py_ssize_t from_start, PyObject *from, Py_ssize_t from_start,
Py_ssize_t how_many) Py_ssize_t how_many)
{ {
int from_kind; unsigned int from_kind;
int to_kind; unsigned int to_kind;
assert(PyUnicode_Check(from)); assert(PyUnicode_Check(from));
assert(PyUnicode_Check(to)); assert(PyUnicode_Check(to));
...@@ -622,94 +622,89 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -622,94 +622,89 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
if (PyUnicode_READY(to)) if (PyUnicode_READY(to))
return -1; return -1;
how_many = PY_MIN(PyUnicode_GET_LENGTH(from), how_many);
if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
PyErr_Format(PyExc_ValueError,
"Cannot write %zi characters at %zi "
"in a string of %zi characters",
how_many, to_start, PyUnicode_GET_LENGTH(to));
return -1;
}
from_kind = PyUnicode_KIND(from); from_kind = PyUnicode_KIND(from);
to_kind = PyUnicode_KIND(to); to_kind = PyUnicode_KIND(to);
if (from_kind == to_kind) { if (from_kind == to_kind) {
const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(to); /* fast path */
Py_MEMCPY(PyUnicode_1BYTE_DATA(to) + (to_start * char_size), Py_MEMCPY((char*)PyUnicode_DATA(to)
PyUnicode_1BYTE_DATA(from) + (from_start * char_size), + PyUnicode_KIND_SIZE(to_kind, to_start),
how_many * char_size); (char*)PyUnicode_DATA(from)
return 0; + PyUnicode_KIND_SIZE(from_kind, from_start),
} PyUnicode_KIND_SIZE(to_kind, how_many));
return how_many;
}
if (from_kind > to_kind) {
/* slow path to check for character overflow */
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
void *from_data = PyUnicode_DATA(from);
void *to_data = PyUnicode_DATA(to);
Py_UCS4 ch, maxchar;
Py_ssize_t i;
int overflow;
switch (from_kind) { maxchar = 0;
case PyUnicode_1BYTE_KIND: for (i=0; i < how_many; i++) {
switch (to_kind) { ch = PyUnicode_READ(from_kind, from_data, from_start + i);
case PyUnicode_2BYTE_KIND: if (ch > maxchar) {
_PyUnicode_CONVERT_BYTES( maxchar = ch;
unsigned char, Py_UCS2, if (maxchar > to_maxchar) {
PyUnicode_1BYTE_DATA(from) + from_start, overflow = 1;
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
PyUnicode_2BYTE_DATA(to) + to_start
);
break;
case PyUnicode_4BYTE_KIND:
_PyUnicode_CONVERT_BYTES(
unsigned char, Py_UCS4,
PyUnicode_1BYTE_DATA(from) + from_start,
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
PyUnicode_4BYTE_DATA(to) + to_start
);
break;
default:
goto invalid_state;
}
break;
case PyUnicode_2BYTE_KIND:
switch (to_kind) {
case PyUnicode_1BYTE_KIND:
_PyUnicode_CONVERT_BYTES(
Py_UCS2, unsigned char,
PyUnicode_2BYTE_DATA(from) + from_start,
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
PyUnicode_1BYTE_DATA(to) + to_start
);
break;
case PyUnicode_4BYTE_KIND:
_PyUnicode_CONVERT_BYTES(
Py_UCS2, Py_UCS4,
PyUnicode_2BYTE_DATA(from) + from_start,
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
PyUnicode_4BYTE_DATA(to) + to_start
);
break;
default:
goto invalid_state;
}
break;
case PyUnicode_4BYTE_KIND:
switch (to_kind) {
case PyUnicode_1BYTE_KIND:
_PyUnicode_CONVERT_BYTES(
Py_UCS4, unsigned char,
PyUnicode_4BYTE_DATA(from) + from_start,
PyUnicode_4BYTE_DATA(from) + from_start + how_many,
PyUnicode_1BYTE_DATA(to) + to_start
);
break;
case PyUnicode_2BYTE_KIND:
_PyUnicode_CONVERT_BYTES(
Py_UCS4, Py_UCS2,
PyUnicode_4BYTE_DATA(from) + from_start,
PyUnicode_4BYTE_DATA(from) + from_start + how_many,
PyUnicode_2BYTE_DATA(to) + to_start
);
break; break;
default: }
goto invalid_state;
} }
break; PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
default: }
goto invalid_state; if (!overflow)
return how_many;
}
else if (from_kind == PyUnicode_1BYTE_KIND && to_kind == PyUnicode_2BYTE_KIND)
{
_PyUnicode_CONVERT_BYTES(
Py_UCS1, Py_UCS2,
PyUnicode_1BYTE_DATA(from) + from_start,
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
PyUnicode_2BYTE_DATA(to) + to_start
);
return how_many;
}
else if (from_kind == PyUnicode_1BYTE_KIND
&& to_kind == PyUnicode_4BYTE_KIND)
{
_PyUnicode_CONVERT_BYTES(
Py_UCS1, Py_UCS4,
PyUnicode_1BYTE_DATA(from) + from_start,
PyUnicode_1BYTE_DATA(from) + from_start + how_many,
PyUnicode_4BYTE_DATA(to) + to_start
);
return how_many;
}
else if (from_kind == PyUnicode_2BYTE_KIND
&& to_kind == PyUnicode_4BYTE_KIND)
{
_PyUnicode_CONVERT_BYTES(
Py_UCS2, Py_UCS4,
PyUnicode_2BYTE_DATA(from) + from_start,
PyUnicode_2BYTE_DATA(from) + from_start + how_many,
PyUnicode_4BYTE_DATA(to) + to_start
);
return how_many;
} }
return 0;
invalid_state:
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Impossible kind state (from=%i, to=%i) " "Cannot copy UCS%u characters "
"in PyUnicode_CopyCharacters", "into a string of UCS%u characters",
from_kind, to_kind); 1 << (from_kind - 1),
1 << (to_kind -1));
return -1; return -1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment