Commit b9275c10 authored by Victor Stinner

Speedup str[a:b] and PyUnicode_FromKindAndData

 * str[a:b] no longer scans the string for the maximum character if the
   string is ASCII-only
 * PyUnicode_FromKindAndData() stops scanning for the maximum character as
   soon as it is certain that a shorter character type cannot be used. For
   example, _PyUnicode_FromUCS1() stops at the first character in the range
   U+0080-U+00FF
parent 702c7343
...@@ -654,6 +654,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromString( ...@@ -654,6 +654,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromString(
const char *u /* UTF-8 encoded string */ const char *u /* UTF-8 encoded string */
); );
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
Scan the string to find the maximum character. */
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
int kind, int kind,
......
...@@ -969,7 +969,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -969,7 +969,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
if (from_kind == to_kind if (from_kind == to_kind
/* deny latin1 => ascii */ /* deny latin1 => ascii */
&& PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from)) && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
{ {
Py_MEMCPY((char*)to_data Py_MEMCPY((char*)to_data
+ PyUnicode_KIND_SIZE(to_kind, to_start), + PyUnicode_KIND_SIZE(to_kind, to_start),
...@@ -1013,9 +1013,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ...@@ -1013,9 +1013,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
/* check if max_char(from substring) <= max_char(to) */ /* check if max_char(from substring) <= max_char(to) */
if (from_kind > to_kind if (from_kind > to_kind
/* latin1 => ascii */ /* latin1 => ascii */
|| (PyUnicode_IS_ASCII(to) || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
&& to_kind == PyUnicode_1BYTE_KIND
&& !PyUnicode_IS_ASCII(from)))
{ {
/* slow path to check for character overflow */ /* slow path to check for character overflow */
const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
...@@ -1528,15 +1526,17 @@ static PyObject* ...@@ -1528,15 +1526,17 @@ static PyObject*
_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
{ {
PyObject *res; PyObject *res;
unsigned char max = 127; unsigned char max_char = 127;
Py_ssize_t i; Py_ssize_t i;
assert(size >= 0);
for (i = 0; i < size; i++) { for (i = 0; i < size; i++) {
if (u[i] & 0x80) { if (u[i] & 0x80) {
max = 255; max_char = 255;
break; break;
} }
} }
res = PyUnicode_New(size, max); res = PyUnicode_New(size, max_char);
if (!res) if (!res)
return NULL; return NULL;
memcpy(PyUnicode_1BYTE_DATA(res), u, size); memcpy(PyUnicode_1BYTE_DATA(res), u, size);
...@@ -1547,15 +1547,21 @@ static PyObject* ...@@ -1547,15 +1547,21 @@ static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
{ {
PyObject *res; PyObject *res;
Py_UCS2 max = 0; Py_UCS2 max_char = 0;
Py_ssize_t i; Py_ssize_t i;
for (i = 0; i < size; i++)
if (u[i] > max) assert(size >= 0);
max = u[i]; for (i = 0; i < size; i++) {
res = PyUnicode_New(size, max); if (u[i] > max_char) {
max_char = u[i];
if (max_char >= 256)
break;
}
}
res = PyUnicode_New(size, max_char);
if (!res) if (!res)
return NULL; return NULL;
if (max >= 256) if (max_char >= 256)
memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
else else
for (i = 0; i < size; i++) for (i = 0; i < size; i++)
...@@ -1567,15 +1573,21 @@ static PyObject* ...@@ -1567,15 +1573,21 @@ static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
{ {
PyObject *res; PyObject *res;
Py_UCS4 max = 0; Py_UCS4 max_char = 0;
Py_ssize_t i; Py_ssize_t i;
for (i = 0; i < size; i++)
if (u[i] > max) assert(size >= 0);
max = u[i]; for (i = 0; i < size; i++) {
res = PyUnicode_New(size, max); if (u[i] > max_char) {
max_char = u[i];
if (max_char >= 0x10000)
break;
}
}
res = PyUnicode_New(size, max_char);
if (!res) if (!res)
return NULL; return NULL;
if (max >= 0x10000) if (max_char >= 0x10000)
memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
else { else {
int kind = PyUnicode_KIND(res); int kind = PyUnicode_KIND(res);
...@@ -1596,9 +1608,11 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) ...@@ -1596,9 +1608,11 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
return _PyUnicode_FromUCS2(buffer, size); return _PyUnicode_FromUCS2(buffer, size);
case PyUnicode_4BYTE_KIND: case PyUnicode_4BYTE_KIND:
return _PyUnicode_FromUCS4(buffer, size); return _PyUnicode_FromUCS4(buffer, size);
default:
assert(0 && "invalid kind");
PyErr_SetString(PyExc_SystemError, "invalid kind");
return NULL;
} }
PyErr_SetString(PyExc_SystemError, "invalid kind");
return NULL;
} }
PyObject* PyObject*
...@@ -9383,11 +9397,12 @@ replace(PyObject *self, PyObject *str1, ...@@ -9383,11 +9397,12 @@ replace(PyObject *self, PyObject *str1,
maxchar = PyUnicode_MAX_CHAR_VALUE(self); maxchar = PyUnicode_MAX_CHAR_VALUE(self);
/* Replacing u1 with u2 may cause a maxchar reduction in the /* Replacing u1 with u2 may cause a maxchar reduction in the
result string. */ result string. */
mayshrink = maxchar > 127;
if (u2 > maxchar) { if (u2 > maxchar) {
maxchar = u2; maxchar = u2;
mayshrink = 0; mayshrink = 0;
} }
else
mayshrink = maxchar > 127;
u = PyUnicode_New(slen, maxchar); u = PyUnicode_New(slen, maxchar);
if (!u) if (!u)
goto error; goto error;
...@@ -11039,11 +11054,18 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) ...@@ -11039,11 +11054,18 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
return NULL; return NULL;
} }
kind = PyUnicode_KIND(self); if (PyUnicode_IS_ASCII(self)) {
data = PyUnicode_1BYTE_DATA(self); kind = PyUnicode_KIND(self);
return PyUnicode_FromKindAndData(kind, data = PyUnicode_1BYTE_DATA(self);
data + PyUnicode_KIND_SIZE(kind, start), return unicode_fromascii(data + start, length);
length); }
else {
kind = PyUnicode_KIND(self);
data = PyUnicode_1BYTE_DATA(self);
return PyUnicode_FromKindAndData(kind,
data + PyUnicode_KIND_SIZE(kind, start),
length);
}
} }
static PyObject * static PyObject *
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment