Commit f2c76aa6 authored by Victor Stinner

Issue #14687: str%tuple now uses an optimistic "unicode writer" instead of an
accumulator. Characters are written directly into the output string (no
temporary list is used); the string is resized and widened on demand.
parent ac20f463
...@@ -10074,7 +10074,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) ...@@ -10074,7 +10074,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
switch ((kind)) { \ switch ((kind)) { \
case PyUnicode_1BYTE_KIND: { \ case PyUnicode_1BYTE_KIND: { \
unsigned char * to_ = (unsigned char *)((data)) + (start); \ unsigned char * to_ = (unsigned char *)((data)) + (start); \
memset(to_, (unsigned char)value, length); \ memset(to_, (unsigned char)value, (length)); \
break; \ break; \
} \ } \
case PyUnicode_2BYTE_KIND: { \ case PyUnicode_2BYTE_KIND: { \
...@@ -13655,56 +13655,133 @@ formatchar(PyObject *v) ...@@ -13655,56 +13655,133 @@ formatchar(PyObject *v)
return (Py_UCS4) -1; return (Py_UCS4) -1;
} }
static int struct unicode_writer_t {
repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) PyObject *buffer;
{ void *data;
int r; enum PyUnicode_Kind kind;
assert(count > 0); Py_UCS4 maxchar;
assert(PyUnicode_Check(obj)); Py_ssize_t length;
if (count > 5) { Py_ssize_t pos;
PyObject *repeated = unicode_repeat(obj, count); };
if (repeated == NULL)
return -1; Py_LOCAL_INLINE(void)
r = _PyAccu_Accumulate(acc, repeated); unicode_writer_update(struct unicode_writer_t *writer)
Py_DECREF(repeated); {
return r; writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
writer->data = PyUnicode_DATA(writer->buffer);
writer->kind = PyUnicode_KIND(writer->buffer);
}
Py_LOCAL_INLINE(int)
unicode_writer_init(struct unicode_writer_t *writer,
Py_ssize_t length, Py_UCS4 maxchar)
{
writer->pos = 0;
writer->length = length;
writer->buffer = PyUnicode_New(writer->length, maxchar);
if (writer->buffer == NULL)
return -1;
unicode_writer_update(writer);
return 0;
}
Py_LOCAL_INLINE(int)
unicode_writer_prepare(struct unicode_writer_t *writer,
Py_ssize_t length, Py_UCS4 maxchar)
{
Py_ssize_t newlen;
if (length > PY_SSIZE_T_MAX - writer->pos) {
PyErr_NoMemory();
return -1;
} }
else { newlen = writer->pos + length;
do {
if (_PyAccu_Accumulate(acc, obj)) if (newlen > writer->length && maxchar > writer->maxchar) {
return -1; PyObject *newbuffer;
} while (--count);
/* overallocate 25% to limit the number of resize */
if (newlen > PY_SSIZE_T_MAX - newlen / 4)
writer->length = newlen;
else
writer->length = newlen + newlen / 4;
/* resize + widen */
newbuffer = PyUnicode_New(writer->length, maxchar);
if (newbuffer == NULL)
return -1;
PyUnicode_CopyCharacters(newbuffer, 0,
writer->buffer, 0, writer->pos);
Py_DECREF(writer->buffer);
writer->buffer = newbuffer;
unicode_writer_update(writer);
return 0; return 0;
} }
if (newlen > writer->length) {
/* overallocate 25% to limit the number of resize */
if (newlen > PY_SSIZE_T_MAX - newlen / 4)
writer->length = newlen;
else
writer->length = newlen + newlen / 4;
if (PyUnicode_Resize(&writer->buffer, writer->length) < 0)
return -1;
unicode_writer_update(writer);
}
if (maxchar > writer->maxchar) {
if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
return -1;
unicode_writer_update(writer);
}
return 0;
}
Py_LOCAL_INLINE(int)
unicode_writer_write_str(
struct unicode_writer_t *writer,
PyObject *str, Py_ssize_t start, Py_ssize_t length)
{
Py_UCS4 maxchar;
maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
if (unicode_writer_prepare(writer, length, maxchar) == -1)
return -1;
assert((writer->pos + length) <= writer->length);
copy_characters(writer->buffer, writer->pos,
str, start, length);
writer->pos += length;
return 0;
}
Py_LOCAL_INLINE(int)
unicode_writer_write_char(
struct unicode_writer_t *writer,
Py_UCS4 ch)
{
if (unicode_writer_prepare(writer, 1, ch) == -1)
return -1;
assert((writer->pos + 1) <= writer->length);
PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
writer->pos += 1;
return 0;
}
Py_LOCAL_INLINE(void)
unicode_writer_dealloc(struct unicode_writer_t *writer)
{
Py_CLEAR(writer->buffer);
} }
PyObject * PyObject *
PyUnicode_Format(PyObject *format, PyObject *args) PyUnicode_Format(PyObject *format, PyObject *args)
{ {
void *fmt;
int fmtkind;
PyObject *result;
int kind;
int r;
Py_ssize_t fmtcnt, fmtpos, arglen, argidx; Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
int args_owned = 0; int args_owned = 0;
PyObject *dict = NULL; PyObject *dict = NULL;
PyObject *temp = NULL; PyObject *temp = NULL;
PyObject *second = NULL; PyObject *second = NULL;
PyObject *uformat; PyObject *uformat;
_PyAccu acc; void *fmt;
static PyObject *plus, *minus, *blank, *zero, *percent; enum PyUnicode_Kind kind, fmtkind;
struct unicode_writer_t writer;
if (!plus && !(plus = get_latin1_char('+')))
return NULL;
if (!minus && !(minus = get_latin1_char('-')))
return NULL;
if (!blank && !(blank = get_latin1_char(' ')))
return NULL;
if (!zero && !(zero = get_latin1_char('0')))
return NULL;
if (!percent && !(percent = get_latin1_char('%')))
return NULL;
if (format == NULL || args == NULL) { if (format == NULL || args == NULL) {
PyErr_BadInternalCall(); PyErr_BadInternalCall();
...@@ -13715,13 +13792,15 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13715,13 +13792,15 @@ PyUnicode_Format(PyObject *format, PyObject *args)
return NULL; return NULL;
if (PyUnicode_READY(uformat) == -1) if (PyUnicode_READY(uformat) == -1)
Py_DECREF(uformat); Py_DECREF(uformat);
if (_PyAccu_Init(&acc))
goto onError;
fmt = PyUnicode_DATA(uformat); fmt = PyUnicode_DATA(uformat);
fmtkind = PyUnicode_KIND(uformat); fmtkind = PyUnicode_KIND(uformat);
fmtcnt = PyUnicode_GET_LENGTH(uformat); fmtcnt = PyUnicode_GET_LENGTH(uformat);
fmtpos = 0; fmtpos = 0;
if (unicode_writer_init(&writer, fmtcnt + 100, 127) < 0)
goto onError;
if (PyTuple_Check(args)) { if (PyTuple_Check(args)) {
arglen = PyTuple_Size(args); arglen = PyTuple_Size(args);
argidx = 0; argidx = 0;
...@@ -13736,7 +13815,6 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13736,7 +13815,6 @@ PyUnicode_Format(PyObject *format, PyObject *args)
while (--fmtcnt >= 0) { while (--fmtcnt >= 0) {
if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
PyObject *nonfmt;
Py_ssize_t nonfmtpos; Py_ssize_t nonfmtpos;
nonfmtpos = fmtpos++; nonfmtpos = fmtpos++;
while (fmtcnt >= 0 && while (fmtcnt >= 0 &&
...@@ -13744,12 +13822,9 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13744,12 +13822,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
fmtpos++; fmtpos++;
fmtcnt--; fmtcnt--;
} }
nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos); if (fmtcnt < 0)
if (nonfmt == NULL) fmtpos--;
goto onError; if (unicode_writer_write_str(&writer, uformat, nonfmtpos, fmtpos - nonfmtpos) < 0)
r = _PyAccu_Accumulate(&acc, nonfmt);
Py_DECREF(nonfmt);
if (r)
goto onError; goto onError;
} }
else { else {
...@@ -13758,12 +13833,13 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13758,12 +13833,13 @@ PyUnicode_Format(PyObject *format, PyObject *args)
Py_ssize_t width = -1; Py_ssize_t width = -1;
int prec = -1; int prec = -1;
Py_UCS4 c = '\0'; Py_UCS4 c = '\0';
Py_UCS4 fill, sign; Py_UCS4 fill;
int sign;
Py_UCS4 signchar;
int isnumok; int isnumok;
PyObject *v = NULL; PyObject *v = NULL;
void *pbuf = NULL; void *pbuf = NULL;
Py_ssize_t pindex, len; Py_ssize_t pindex, len;
PyObject *signobj = NULL, *fillobj = NULL;
fmtpos++; fmtpos++;
c = PyUnicode_READ(fmtkind, fmt, fmtpos); c = PyUnicode_READ(fmtkind, fmt, fmtpos);
...@@ -13906,7 +13982,8 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13906,7 +13982,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
} }
if (c == '%') { if (c == '%') {
_PyAccu_Accumulate(&acc, percent); if (unicode_writer_write_char(&writer, '%') < 0)
goto onError;
continue; continue;
} }
...@@ -13916,8 +13993,8 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13916,8 +13993,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError; goto onError;
sign = 0; sign = 0;
signchar = '\0';
fill = ' '; fill = ' ';
fillobj = blank;
switch (c) { switch (c) {
case 's': case 's':
...@@ -13972,10 +14049,8 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13972,10 +14049,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
"not %.200s", (char)c, Py_TYPE(v)->tp_name); "not %.200s", (char)c, Py_TYPE(v)->tp_name);
goto onError; goto onError;
} }
if (flags & F_ZERO) { if (flags & F_ZERO)
fill = '0'; fill = '0';
fillobj = zero;
}
break; break;
case 'e': case 'e':
...@@ -13985,10 +14060,8 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -13985,10 +14060,8 @@ PyUnicode_Format(PyObject *format, PyObject *args)
case 'g': case 'g':
case 'G': case 'G':
sign = 1; sign = 1;
if (flags & F_ZERO) { if (flags & F_ZERO)
fill = '0'; fill = '0';
fillobj = zero;
}
temp = formatfloat(v, flags, prec, c); temp = formatfloat(v, flags, prec, c);
break; break;
...@@ -14029,20 +14102,16 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -14029,20 +14102,16 @@ PyUnicode_Format(PyObject *format, PyObject *args)
/* pbuf is initialized here. */ /* pbuf is initialized here. */
pindex = 0; pindex = 0;
if (sign) { if (sign) {
if (PyUnicode_READ(kind, pbuf, pindex) == '-') { Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
signobj = minus; if (ch == '-' || ch == '+') {
len--; signchar = ch;
pindex++;
}
else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
signobj = plus;
len--; len--;
pindex++; pindex++;
} }
else if (flags & F_SIGN) else if (flags & F_SIGN)
signobj = plus; signchar = '+';
else if (flags & F_BLANK) else if (flags & F_BLANK)
signobj = blank; signchar = ' ';
else else
sign = 0; sign = 0;
} }
...@@ -14050,8 +14119,7 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -14050,8 +14119,7 @@ PyUnicode_Format(PyObject *format, PyObject *args)
width = len; width = len;
if (sign) { if (sign) {
if (fill != ' ') { if (fill != ' ') {
assert(signobj != NULL); if (unicode_writer_write_char(&writer, signchar) < 0)
if (_PyAccu_Accumulate(&acc, signobj))
goto onError; goto onError;
} }
if (width > len) if (width > len)
...@@ -14061,14 +14129,12 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -14061,14 +14129,12 @@ PyUnicode_Format(PyObject *format, PyObject *args)
assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
if (fill != ' ') { if (fill != ' ') {
second = get_latin1_char( if (unicode_writer_prepare(&writer, 2, 127) < 0)
PyUnicode_READ(kind, pbuf, pindex + 1));
pindex += 2;
if (second == NULL ||
_PyAccu_Accumulate(&acc, zero) ||
_PyAccu_Accumulate(&acc, second))
goto onError; goto onError;
Py_CLEAR(second); PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
writer.pos += 2;
pindex += 2;
} }
width -= 2; width -= 2;
if (width < 0) if (width < 0)
...@@ -14076,45 +14142,43 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -14076,45 +14142,43 @@ PyUnicode_Format(PyObject *format, PyObject *args)
len -= 2; len -= 2;
} }
if (width > len && !(flags & F_LJUST)) { if (width > len && !(flags & F_LJUST)) {
assert(fillobj != NULL); Py_ssize_t sublen;
if (repeat_accumulate(&acc, fillobj, width - len)) sublen = width - len;
if (unicode_writer_prepare(&writer, sublen, fill) < 0)
goto onError; goto onError;
FILL(writer.kind, writer.data, fill, writer.pos, sublen);
writer.pos += sublen;
width = len; width = len;
} }
if (fill == ' ') { if (fill == ' ') {
if (sign) { if (sign) {
assert(signobj != NULL); if (unicode_writer_write_char(&writer, signchar) < 0)
if (_PyAccu_Accumulate(&acc, signobj))
goto onError; goto onError;
} }
if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
second = get_latin1_char(
PyUnicode_READ(kind, pbuf, pindex + 1)); if (unicode_writer_prepare(&writer, 2, 127) < 0)
pindex += 2;
if (second == NULL ||
_PyAccu_Accumulate(&acc, zero) ||
_PyAccu_Accumulate(&acc, second))
goto onError; goto onError;
Py_CLEAR(second); PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
writer.pos += 2;
pindex += 2;
} }
} }
/* Copy all characters, preserving len */ /* Copy all characters, preserving len */
if (pindex == 0 && len == PyUnicode_GET_LENGTH(temp)) { if (unicode_writer_write_str(&writer, temp, pindex, len) < 0)
r = _PyAccu_Accumulate(&acc, temp); goto onError;
} if (width > len) {
else { Py_ssize_t sublen = width - len;
v = PyUnicode_Substring(temp, pindex, pindex + len); if (unicode_writer_prepare(&writer, sublen, ' ') < 0)
if (v == NULL)
goto onError; goto onError;
r = _PyAccu_Accumulate(&acc, v); FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
Py_DECREF(v); writer.pos += sublen;
} }
if (r)
goto onError;
if (width > len && repeat_accumulate(&acc, blank, width - len))
goto onError;
if (dict && (argidx < arglen) && c != '%') { if (dict && (argidx < arglen) && c != '%') {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"not all arguments converted during string formatting"); "not all arguments converted during string formatting");
...@@ -14129,20 +14193,22 @@ PyUnicode_Format(PyObject *format, PyObject *args) ...@@ -14129,20 +14193,22 @@ PyUnicode_Format(PyObject *format, PyObject *args)
goto onError; goto onError;
} }
result = _PyAccu_Finish(&acc); if (PyUnicode_Resize(&writer.buffer, writer.pos) < 0)
goto onError;
if (args_owned) { if (args_owned) {
Py_DECREF(args); Py_DECREF(args);
} }
Py_DECREF(uformat); Py_DECREF(uformat);
Py_XDECREF(temp); Py_XDECREF(temp);
Py_XDECREF(second); Py_XDECREF(second);
return result; return writer.buffer;
onError: onError:
Py_DECREF(uformat); Py_DECREF(uformat);
Py_XDECREF(temp); Py_XDECREF(temp);
Py_XDECREF(second); Py_XDECREF(second);
_PyAccu_Destroy(&acc); unicode_writer_dealloc(&writer);
if (args_owned) { if (args_owned) {
Py_DECREF(args); Py_DECREF(args);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment