Commit 9c121069 authored by Martin v. Löwis's avatar Martin v. Löwis

Change PyUnicode_FromString[AndSize] to expect UTF-8.

parent 64ce5052
......@@ -996,10 +996,11 @@ use these APIs:
\var{u} is \NULL{}.
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromStringAndSize}{const char *u,
Py_ssize_t size}
Create a Unicode Object from the char buffer \var{u}.
\var{u} must be 0-terminated, the bytes will be interpreted as
being latin-1 encoded. \var{u} may also be \NULL{} which causes the
The bytes will be interpreted as being UTF-8 encoded.
\var{u} may also be \NULL{} which causes the
contents to be undefined. It is the user's responsibility to fill
in the needed data. The buffer is copied into the new object.
If the buffer is not \NULL{}, the return value might be a shared object.
......@@ -1008,6 +1009,12 @@ use these APIs:
\versionadded{3.0}
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromString}{const char *u}
Create a Unicode object from a UTF-8 encoded null-terminated
char buffer \var{u}.
\versionadded{3.0}
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromFormat}{const char *format, ...}
Take a C \cfunction{printf()}-style \var{format} string and a
variable number of arguments, calculate the size of the resulting
......
......@@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
/* Return state information for pickling.
 *
 * Builds the tuple (type(self), (text, "latin-1")), where `text` is a
 * Unicode object holding the buffer contents decoded as Latin-1, so every
 * byte value 0..255 maps to exactly one code point.
 *
 * The buffer is decoded explicitly with the Latin-1 codec rather than being
 * passed through the "s#" format unit: that unit goes through the generic
 * char*-to-Unicode constructor, which expects UTF-8 and therefore cannot
 * represent arbitrary bytes >= 0x80.
 *
 * Returns a new reference, or NULL with an exception set on failure.
 */
static PyObject *
bytes_reduce(PyBytesObject *self)
{
    PyObject *latin1;

    if (self->ob_bytes)
        latin1 = PyUnicode_DecodeLatin1(self->ob_bytes,
                                        Py_Size(self), NULL);
    else
        /* No buffer allocated: pickle as the empty string. */
        latin1 = PyUnicode_FromString("");
    if (latin1 == NULL)
        return NULL;

    /* "N" hands our reference to latin1 over to the new tuple. */
    return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
}
static PySequenceMethods bytes_as_sequence = {
......
......@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
PyUnicodeObject *unicode;
/* If the Unicode data is known at construction time, we can apply
some optimizations which share commonly used objects. */
some optimizations which share commonly used objects.
Also, this means the input must be UTF-8, so fall back to the
UTF-8 decoder at the end. */
if (u != NULL) {
/* Optimization for empty strings */
......@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
return (PyObject *)unicode_empty;
}
/* Single characters are shared when using this constructor */
if (size == 1) {
/* Single characters are shared when using this constructor.
Restrict to ASCII, since the input must be UTF-8. */
if (size == 1 && Py_CHARMASK(*u) < 128) {
unicode = unicode_latin1[Py_CHARMASK(*u)];
if (!unicode) {
unicode = _PyUnicode_New(1);
......@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Py_INCREF(unicode);
return (PyObject *)unicode;
}
return PyUnicode_DecodeUTF8(u, size, NULL);
}
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
/* Copy the Unicode data into the new object */
if (u != NULL) {
Py_UNICODE *p = unicode->str;
while (size--)
*p++ = Py_CHARMASK(*u++);
/* Don't need to write trailing 0 because
that's already done by _PyUnicode_New */
}
return (PyObject *)unicode;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment