Commit 770847a7, authored by Inada Naoki, committed by GitHub

bpo-37348: optimize decoding ASCII string (GH-14283)

`_PyUnicodeWriter` is a relatively complex structure. Initializing it adds significant overhead when decoding a short ASCII string.
parent b3ca7972
Optimized decoding of short ASCII strings with the UTF-8 and ASCII codecs.
``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki.
...@@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, ...@@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
/* Forward declaration */ /* Forward declaration */
static inline int static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject * static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors); const char *errors);
...@@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, ...@@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors, _Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed) Py_ssize_t *consumed)
{ {
_PyUnicodeWriter writer;
const char *starts = s;
const char *end = s + size;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
if (size == 0) { if (size == 0) {
if (consumed) if (consumed)
*consumed = 0; *consumed = 0;
...@@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, ...@@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]); return get_latin1_char((unsigned char)s[0]);
} }
_PyUnicodeWriter_Init(&writer); const char *starts = s;
writer.min_length = size; const char *end = s + size;
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
goto onError; // fast path: try ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
s += ascii_decode(s, end, PyUnicode_DATA(u));
if (s == end) {
return u;
}
// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = s - starts;
Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
writer.pos = ascii_decode(s, end, writer.data);
s += writer.pos;
while (s < end) { while (s < end) {
Py_UCS4 ch; Py_UCS4 ch;
int kind = writer.kind; int kind = writer.kind;
...@@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s, ...@@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s,
const char *errors) const char *errors)
{ {
const char *starts = s; const char *starts = s;
_PyUnicodeWriter writer; const char *e = s + size;
int kind;
void *data;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
PyObject *error_handler_obj = NULL; PyObject *error_handler_obj = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
...@@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s, ...@@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128) if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]); return get_latin1_char((unsigned char)s[0]);
_PyUnicodeWriter_Init(&writer); // Shortcut for simple case
writer.min_length = size; PyObject *u = PyUnicode_New(size, 127);
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) if (u == NULL) {
return NULL; return NULL;
}
Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
if (outpos == size) {
return u;
}
e = s + size; _PyUnicodeWriter writer;
data = writer.data; _PyUnicodeWriter_InitWithBuffer(&writer, u);
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
writer.pos = outpos; writer.pos = outpos;
if (writer.pos == size)
return _PyUnicodeWriter_Finish(&writer);
s += writer.pos; s += outpos;
kind = writer.kind; int kind = writer.kind;
void *data = writer.data;
Py_ssize_t startinpos, endinpos;
while (s < e) { while (s < e) {
unsigned char c = (unsigned char)*s; unsigned char c = (unsigned char)*s;
if (c < 128) { if (c < 128) {
...@@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) ...@@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
assert(writer->kind <= PyUnicode_1BYTE_KIND); assert(writer->kind <= PyUnicode_1BYTE_KIND);
} }
// Initialize _PyUnicodeWriter with initial buffer
//
// Resets `writer` and makes it start out backed by the existing object
// `buffer`, rather than beginning from an empty writer.
//
//   writer: the writer structure to (re)initialize; all of its previous
//           contents are overwritten.
//   buffer: the object to install as the writer's buffer.  The pointer is
//           stored as-is (no Py_INCREF happens here).
//           NOTE(review): presumably the caller hands its reference over to
//           the writer -- confirm against the call sites.
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
// Zero every field first so nothing from a previous use leaks through.
memset(writer, 0, sizeof(*writer));
writer->buffer = buffer;
// NOTE(review): presumably refreshes the writer's cached fields (such as
// data/kind/size) from the newly installed buffer -- confirm in
// _PyUnicodeWriter_Update.
_PyUnicodeWriter_Update(writer);
// Use the size recorded for the installed buffer as the minimum length.
writer->min_length = writer->size;
}
int int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar) Py_ssize_t length, Py_UCS4 maxchar)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment