Commit b3316ece authored by Victor Stinner

Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster

Patch written by Serhiy Storchaka.
parent 17dfab1b
...@@ -157,7 +157,7 @@ Optimizations ...@@ -157,7 +157,7 @@ Optimizations
Major performance enhancements have been added: Major performance enhancements have been added:
* None yet. * The UTF-32 decoder is now 3x to 4x faster.
Build and C API Changes Build and C API Changes
......
...@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1? ...@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
written by Serhiy Storchaka.
- Issue #16197: Update winreg docstrings and documentation to match code. - Issue #16197: Update winreg docstrings and documentation to match code.
Patch by Zachary Ware. Patch by Zachary Ware.
......
...@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
const unsigned char *q, *e; const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */ int le, bo = 0; /* assume native ordering by default */
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving bytes in the right order. */
#if PY_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
...@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
byte order setting accordingly. In native mode, the leading BOM byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */ stream as-is (giving a ZWNBSP character). */
if (bo == 0) { if (bo == 0 && size >= 4) {
if (size >= 4) { Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | if (bom == 0x0000FEFF) {
(q[iorder[1]] << 8) | q[iorder[0]]; bo = -1;
#if PY_LITTLE_ENDIAN q += 4;
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
} }
else if (bom == 0xFFFE0000) {
bo = 1;
q += 4;
}
if (byteorder)
*byteorder = bo;
} }
if (bo == -1) { if (q == e) {
/* force LE */ if (consumed)
iorder[0] = 0; *consumed = size;
iorder[1] = 1; Py_INCREF(unicode_empty);
iorder[2] = 2; return unicode_empty;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
} }
/* This might be one to much, because of a BOM */ #ifdef WORDS_BIGENDIAN
unicode = PyUnicode_New((size+3)/4, 127); le = bo < 0;
#else
le = bo <= 0;
#endif
unicode = PyUnicode_New((e - q + 3) / 4, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0)
return unicode;
outpos = 0; outpos = 0;
while (1) {
Py_UCS4 ch = 0;
Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
if (e - q >= 4) {
enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
void *data = PyUnicode_DATA(unicode);
const unsigned char *last = e - 4;
if (le) {
do {
ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
else {
do {
ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
if (ch > maxch)
break;
PyUnicode_WRITE(kind, data, outpos++, ch);
q += 4;
} while (q <= last);
}
}
while (q < e) { if (ch <= maxch) {
Py_UCS4 ch; if (q == e || consumed)
/* remaining bytes at the end? (size should be divisible by 4) */
if (e-q<4) {
if (consumed)
break; break;
/* remaining bytes at the end? (size should be divisible by 4) */
errmsg = "truncated data"; errmsg = "truncated data";
startinpos = ((const char *)q)-starts; startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e)-starts; endinpos = ((const char *)e) - starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
} }
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | else {
(q[iorder[1]] << 8) | q[iorder[0]]; if (ch < 0x110000) {
if (unicode_putchar(&unicode, &outpos, ch) < 0)
if (ch >= 0x110000) goto onError;
{ q += 4;
continue;
}
errmsg = "codepoint not in range(0x110000)"; errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts; startinpos = ((const char *)q) - starts;
endinpos = startinpos+4; endinpos = startinpos + 4;
goto utf32Error;
} }
if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError; /* The remaining input chars are ignored if the callback
q += 4; chooses to skip the input */
continue;
utf32Error:
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf32", errmsg, "utf32", errmsg,
...@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
goto onError; goto onError;
} }
if (byteorder)
*byteorder = bo;
if (consumed) if (consumed)
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment