Commit ab868311 authored by Antoine Pitrou

Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate
amount of loop unrolling.

This will especially help text I/O (we already register a 30% speedup on large
reads on the io-c branch).
parent dd6351e6
...@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0 ...@@ -12,6 +12,10 @@ What's New in Python 3.1 alpha 0
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #4868: utf-8, utf-16 and latin1 decoding are now 2x to 4x faster. The
common cases are optimized thanks to a dedicated fast path and a moderate
amount of loop unrolling.
- Issue #4074: Change the criteria for doing a full garbage collection (i.e. - Issue #4074: Change the criteria for doing a full garbage collection (i.e.
collecting the oldest generation) so that allocating lots of objects without collecting the oldest generation) so that allocating lots of objects without
destroying them does not show quadratic performance. Based on a proposal by destroying them does not show quadratic performance. Based on a proposal by
......
...@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ...@@ -2001,6 +2001,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
} }
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The replacement list is fully parenthesized so that the macro always
   expands as one self-contained expression. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: it has the high bit of every byte set,
   so `data & ASCII_CHAR_MASK` is non-zero as soon as one byte is >= 0x80.
   The constants are spelled with an unsigned suffix because their top
   bit is set and they are meant to be AND'ed with an unsigned long. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size, Py_ssize_t size,
const char *errors, const char *errors,
...@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -2011,7 +2024,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos; Py_ssize_t startinpos;
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
const char *e; const char *e, *aligned_end;
PyUnicodeObject *unicode; PyUnicodeObject *unicode;
Py_UNICODE *p; Py_UNICODE *p;
const char *errmsg = ""; const char *errmsg = "";
...@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -2032,10 +2045,51 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
/* Unpack UTF-8 encoded data */ /* Unpack UTF-8 encoded data */
p = unicode->str; p = unicode->str;
e = s + size; e = s + size;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
while (s < e) { while (s < e) {
Py_UCS4 ch = (unsigned char)*s; Py_UCS4 ch = (unsigned char)*s;
if (ch < 0x80) {
/* Fast path for runs of ASCII characters. Given that common UTF-8
input will consist of an overwhelming majority of ASCII
characters, we try to optimize for this case by checking
as many characters as a C 'long' can contain.
First, check if we can do an aligned read, as most CPUs have
a penalty for unaligned reads.
*/
if (!((size_t) s & LONG_PTR_MASK)) {
/* Help register allocation */
register const char *_s = s;
register Py_UNICODE *_p = p;
while (_s < aligned_end) {
/* Read a whole long at a time (either 4 or 8 bytes),
and do a fast unrolled copy if it only contains ASCII
characters. */
unsigned long data = *(unsigned long *) _s;
if (data & ASCII_CHAR_MASK)
break;
_p[0] = (unsigned char) _s[0];
_p[1] = (unsigned char) _s[1];
_p[2] = (unsigned char) _s[2];
_p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
_p[4] = (unsigned char) _s[4];
_p[5] = (unsigned char) _s[5];
_p[6] = (unsigned char) _s[6];
_p[7] = (unsigned char) _s[7];
#endif
_s += SIZEOF_LONG;
_p += SIZEOF_LONG;
}
s = _s;
p = _p;
if (s == e)
break;
ch = (unsigned char)*s;
}
}
if (ch < 0x80) { if (ch < 0x80) {
*p++ = (Py_UNICODE)ch; *p++ = (Py_UNICODE)ch;
s++; s++;
...@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -2169,6 +2223,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p)) &unicode, &outpos, &p))
goto onError; goto onError;
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
} }
if (consumed) if (consumed)
*consumed = s-starts; *consumed = s-starts;
...@@ -2188,6 +2243,9 @@ onError: ...@@ -2188,6 +2243,9 @@ onError:
return NULL; return NULL;
} }
#undef ASCII_CHAR_MASK
/* Allocation strategy: if the string is short, convert into a stack buffer /* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the and allocate exactly as much space needed at the end. Else allocate the
maximum possible needed (4 result bytes per Unicode character), and return maximum possible needed (4 result bytes per Unicode character), and return
...@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s, ...@@ -2582,6 +2640,23 @@ PyUnicode_DecodeUTF16(const char *s,
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
} }
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. Each mask selects the top bit of
   every 16-bit code unit packed in the 'long', so a non-zero AND means at
   least one code unit is >= 0x8000 (a range that includes all surrogates).
   This is an efficient heuristic, assuming that non-surrogate characters
   with a code point >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering (the
   top bit of each code unit then sits in the other byte of its 16-bit lane).
   The constants carry an unsigned suffix since they are bit masks meant to
   be AND'ed with an unsigned long.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000UL
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080UL
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000UL
# define SWAPPED_FAST_CHAR_MASK 0x00800080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
PyObject * PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s, PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t size, Py_ssize_t size,
...@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -2595,8 +2670,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
PyUnicodeObject *unicode; PyUnicodeObject *unicode;
Py_UNICODE *p; Py_UNICODE *p;
const unsigned char *q, *e; const unsigned char *q, *e, *aligned_end;
int bo = 0; /* assume native ordering by default */ int bo = 0; /* assume native ordering by default */
int native_ordering = 0;
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */ /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN #ifdef BYTEORDER_IS_LITTLE_ENDIAN
...@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -2618,7 +2694,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Unpack UTF-16 encoded data */ /* Unpack UTF-16 encoded data */
p = unicode->str; p = unicode->str;
q = (unsigned char *)s; q = (unsigned char *)s;
e = q + size; e = q + size - 1;
if (byteorder) if (byteorder)
bo = *byteorder; bo = *byteorder;
...@@ -2662,19 +2738,77 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -2662,19 +2738,77 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
ihi = 0; ihi = 0;
ilo = 1; ilo = 1;
} }
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
native_ordering = ilo < ihi;
#else
native_ordering = ilo > ihi;
#endif
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
while (q < e) { while (q < e) {
Py_UNICODE ch; Py_UNICODE ch;
/* remaining bytes at the end? (size should be even) */ /* First check for possible aligned read of a C 'long'. Unaligned
if (e-q<2) { reads are more expensive, better to defer to another iteration. */
if (consumed) if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of non-surrogate chars. */
register const unsigned char *_q = q;
Py_UNICODE *_p = p;
if (native_ordering) {
/* Native ordering is simple: as long as the input cannot
possibly contain a surrogate char, do an unrolled copy
of several 16-bit code points to the target object.
The non-surrogate check is done on several input bytes
at a time (as many as a C 'long' can contain). */
while (_q < aligned_end) {
unsigned long data = * (unsigned long *) _q;
if (data & FAST_CHAR_MASK)
break;
_p[0] = ((unsigned short *) _q)[0];
_p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
_p[2] = ((unsigned short *) _q)[2];
_p[3] = ((unsigned short *) _q)[3];
#endif
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
}
}
else {
/* Byteswapped ordering is similar, but we must decompose
the copy bytewise, and take care of zero'ing out the
upper bytes if the target object is in 32-bit units
(that is, in UCS-4 builds). */
while (_q < aligned_end) {
unsigned long data = * (unsigned long *) _q;
if (data & SWAPPED_FAST_CHAR_MASK)
break;
/* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
_p[0] = 0;
_p[1] = 0;
#if (SIZEOF_LONG == 8)
_p[2] = 0;
_p[3] = 0;
#endif
#endif
((unsigned char *) _p)[1] = _q[0];
((unsigned char *) _p)[0] = _q[1];
((unsigned char *) _p)[1 + Py_UNICODE_SIZE] = _q[2];
((unsigned char *) _p)[0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
((unsigned char *) _p)[1 + 2 * Py_UNICODE_SIZE] = _q[4];
((unsigned char *) _p)[0 + 2 * Py_UNICODE_SIZE] = _q[5];
((unsigned char *) _p)[1 + 3 * Py_UNICODE_SIZE] = _q[6];
((unsigned char *) _p)[0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
}
}
p = _p;
q = _q;
if (q >= e)
break; break;
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
endinpos = ((const char *)e)-starts;
goto utf16Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
} }
ch = (q[ihi] << 8) | q[ilo]; ch = (q[ihi] << 8) | q[ilo];
...@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -2686,10 +2820,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
} }
/* UTF-16 code pair: */ /* UTF-16 code pair: */
if (q >= e) { if (q > e) {
errmsg = "unexpected end of data"; errmsg = "unexpected end of data";
startinpos = (((const char *)q)-2)-starts; startinpos = (((const char *)q) - 2) - starts;
endinpos = ((const char *)e)-starts; endinpos = ((const char *)e) + 1 - starts;
goto utf16Error; goto utf16Error;
} }
if (0xD800 <= ch && ch <= 0xDBFF) { if (0xD800 <= ch && ch <= 0xDBFF) {
...@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -2718,14 +2852,47 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Fall through to report the error */ /* Fall through to report the error */
utf16Error: utf16Error:
outpos = p-PyUnicode_AS_UNICODE(unicode); outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors,
&errorHandler,
"utf16", errmsg, "utf16", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, &starts,
&unicode, &outpos, &p)) (const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos,
&p))
goto onError; goto onError;
} }
/* remaining byte at the end? (size should be even) */
if (e == q) {
if (!consumed) {
errmsg = "truncated data";
startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) + 1 - starts;
outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors,
&errorHandler,
"utf16", errmsg,
&starts,
(const char **)&e,
&startinpos,
&endinpos,
&exc,
(const char **)&q,
&unicode,
&outpos,
&p))
goto onError;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
}
if (byteorder) if (byteorder)
*byteorder = bo; *byteorder = bo;
...@@ -2748,6 +2915,9 @@ onError: ...@@ -2748,6 +2915,9 @@ onError:
return NULL; return NULL;
} }
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
PyObject * PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s, PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Py_ssize_t size, Py_ssize_t size,
...@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s, ...@@ -3571,6 +3741,7 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
Py_UNICODE *p; Py_UNICODE *p;
const char *e, *unrolled_end;
/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
if (size == 1) { if (size == 1) {
...@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s, ...@@ -3584,8 +3755,20 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,
if (size == 0) if (size == 0)
return (PyObject *)v; return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v); p = PyUnicode_AS_UNICODE(v);
while (size-- > 0) e = s + size;
*p++ = (unsigned char)*s++; /* Unrolling the copy makes it much faster by reducing the looping
overhead. This is similar to what many memcpy() implementations do. */
unrolled_end = e - 4;
while (s < unrolled_end) {
p[0] = (unsigned char) s[0];
p[1] = (unsigned char) s[1];
p[2] = (unsigned char) s[2];
p[3] = (unsigned char) s[3];
s += 4;
p += 4;
}
while (s < e)
*p++ = (unsigned char) *s++;
return (PyObject *)v; return (PyObject *)v;
onError: onError:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment