bpo-33954: Fix _PyUnicode_InsertThousandsGrouping() (GH-10623)

Fix str.format(), float.__format__() and complex.__format__() methods for non-ASCII decimal point when using the "n" formatter. Changes: * Rewrite _PyUnicode_InsertThousandsGrouping(): it now requires a _PyUnicodeWriter object for the buffer and a Python str object for digits. * Rename FILL() macro to unicode_fill(), convert it to static inline function, add "assert(0 <= start);" and rework its code.

bpo-33954: Fix _PyUnicode_InsertThousandsGrouping() (GH-10623)
Fix str.format(), float.__format__() and complex.__format__() methods for non-ASCII decimal point when using the "n" formatter. Changes: * Rewrite _PyUnicode_InsertThousandsGrouping(): it now requires a _PyUnicodeWriter object for the buffer and a Python str object for digits. * Rename FILL() macro to unicode_fill(), convert it to static inline function, add "assert(0 <= start);" and rework its code.
59423e3d · Victor Stinner · GitHub · df108dc6 · 59423e3d · 59423e3d
Commit 59423e3d authored Nov 26, 2018 by Victor Stinner Committed by GitHub Nov 26, 2018
5 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -2135,10 +2135,10 @@ PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
   see Objects/stringlib/localeutil.h */
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
-    PyObject *unicode,
-    Py_ssize_t index,
+    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
-    void *digits,
+    PyObject *digits,
+    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,

--- a/Misc/NEWS.d/next/Core and Builtins/2018-11-20-22-33-38.bpo-33954.RzSngM.rst
+++ b/Misc/NEWS.d/next/Core and Builtins/2018-11-20-22-33-38.bpo-33954.RzSngM.rst
+For :meth:`str.format`, :meth:`float.__format__` and
+:meth:`complex.__format__` methods for non-ASCII decimal point when using
+the "n" formatter.
--- a/Objects/stringlib/localeutil.h
+++ b/Objects/stringlib/localeutil.h
-/* stringlib: locale related helpers implementation */
-
-#include <locale.h>
-
-#if !STRINGLIB_IS_UNICODE
-#   error "localeutil.h is specific to Unicode"
-#endif
+/* _PyUnicode_InsertThousandsGrouping() helper functions */

 typedef struct {
    const char *grouping;
    char previous;
    Py_ssize_t i; /* Where we're currently pointing in grouping. */
-} STRINGLIB(GroupGenerator);
+} GroupGenerator;
+

 static void
-STRINGLIB(GroupGenerator_init)(STRINGLIB(GroupGenerator) *self, const char *grouping)
+GroupGenerator_init(GroupGenerator *self, const char *grouping)
 {
    self->grouping = grouping;
    self->i = 0;
    self->previous = 0;
 }

+
 /* Returns the next grouping, or 0 to signify end. */
 static Py_ssize_t
-STRINGLIB(GroupGenerator_next)(STRINGLIB(GroupGenerator) *self)
+GroupGenerator_next(GroupGenerator *self)
 {
    /* Note that we don't really do much error checking here. If a
       grouping string contains just CHAR_MAX, for example, then just
@@ -43,138 +39,44 @@ STRINGLIB(GroupGenerator_next)(STRINGLIB(GroupGenerator) *self)
    }
 }

+
 /* Fill in some digits, leading zeros, and thousands separator. All
   are optional, depending on when we're called. */
 static void
-STRINGLIB(fill)(STRINGLIB_CHAR **digits_end, STRINGLIB_CHAR **buffer_end,
-     Py_ssize_t n_chars, Py_ssize_t n_zeros, STRINGLIB_CHAR* thousands_sep,
-     Py_ssize_t thousands_sep_len)
+InsertThousandsGrouping_fill(_PyUnicodeWriter *writer, Py_ssize_t *buffer_pos,
+                             PyObject *digits, Py_ssize_t *digits_pos,
+                             Py_ssize_t n_chars, Py_ssize_t n_zeros,
+                             PyObject *thousands_sep, Py_ssize_t thousands_sep_len,
+                             Py_UCS4 *maxchar)
 {
-    Py_ssize_t i;
+    if (!writer) {
+        /* if maxchar > 127, maxchar is already set */
+        if (*maxchar == 127 && thousands_sep) {
+            Py_UCS4 maxchar2 = PyUnicode_MAX_CHAR_VALUE(thousands_sep);
+            *maxchar = Py_MAX(*maxchar, maxchar2);
+        }
+        return;
+    }

    if (thousands_sep) {
-        *buffer_end -= thousands_sep_len;
+        *buffer_pos -= thousands_sep_len;

        /* Copy the thousands_sep chars into the buffer. */
-        memcpy(*buffer_end, thousands_sep,
-               thousands_sep_len * STRINGLIB_SIZEOF_CHAR);
-    }
-
-    *buffer_end -= n_chars;
-    *digits_end -= n_chars;
-    memcpy(*buffer_end, *digits_end, n_chars * sizeof(STRINGLIB_CHAR));
-
-    *buffer_end -= n_zeros;
-    for (i = 0; i < n_zeros; i++)
-        (*buffer_end)[i] = '0';
-}
-
-/**
- * InsertThousandsGrouping:
- * @buffer: A pointer to the start of a string.
- * @n_buffer: Number of characters in @buffer.
- * @digits: A pointer to the digits we're reading from. If count
- *          is non-NULL, this is unused.
- * @n_digits: The number of digits in the string, in which we want
- *            to put the grouping chars.
- * @min_width: The minimum width of the digits in the output string.
- *             Output will be zero-padded on the left to fill.
- * @grouping: see definition in localeconv().
- * @thousands_sep: see definition in localeconv().
- *
- * There are 2 modes: counting and filling. If @buffer is NULL,
- *  we are in counting mode, else filling mode.
- * If counting, the required buffer size is returned.
- * If filling, we know the buffer will be large enough, so we don't
- *  need to pass in the buffer size.
- * Inserts thousand grouping characters (as defined by grouping and
- *  thousands_sep) into the string between buffer and buffer+n_digits.
- *
- * Return value: 0 on error, else 1.  Note that no error can occur if
- *  count is non-NULL.
- *
- * This name won't be used, the includer of this file should define
- *  it to be the actual function name, based on unicode or string.
- *
- * As closely as possible, this code mimics the logic in decimal.py's
-    _insert_thousands_sep().
- **/
-static Py_ssize_t
-STRINGLIB(InsertThousandsGrouping)(
-    STRINGLIB_CHAR *buffer,
-    Py_ssize_t n_buffer,
-    STRINGLIB_CHAR *digits,
-    Py_ssize_t n_digits,
-    Py_ssize_t min_width,
-    const char *grouping,
-    STRINGLIB_CHAR *thousands_sep,
-    Py_ssize_t thousands_sep_len)
-{
-    Py_ssize_t count = 0;
-    Py_ssize_t n_zeros;
-    int loop_broken = 0;
-    int use_separator = 0; /* First time through, don't append the
-                              separator. They only go between
-                              groups. */
-    STRINGLIB_CHAR *buffer_end = NULL;
-    STRINGLIB_CHAR *digits_end = NULL;
-    Py_ssize_t l;
-    Py_ssize_t n_chars;
-    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
-                                        be looked at */
-    /* A generator that returns all of the grouping widths, until it
-       returns 0. */
-    STRINGLIB(GroupGenerator) groupgen;
-    STRINGLIB(GroupGenerator_init)(&groupgen, grouping);
-
-    if (buffer) {
-        buffer_end = buffer + n_buffer;
-        digits_end = digits + n_digits;
-    }
-
-    while ((l = STRINGLIB(GroupGenerator_next)(&groupgen)) > 0) {
-        l = Py_MIN(l, Py_MAX(Py_MAX(remaining, min_width), 1));
-        n_zeros = Py_MAX(0, l - remaining);
-        n_chars = Py_MAX(0, Py_MIN(remaining, l));
-
-        /* Use n_zero zero's and n_chars chars */
-
-        /* Count only, don't do anything. */
-        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
-
-        if (buffer) {
-            /* Copy into the output buffer. */
-            STRINGLIB(fill)(&digits_end, &buffer_end, n_chars, n_zeros,
-                 use_separator ? thousands_sep : NULL, thousands_sep_len);
-        }
-
-        /* Use a separator next time. */
-        use_separator = 1;
-
-        remaining -= n_chars;
-        min_width -= l;
-
-        if (remaining <= 0 && min_width <= 0) {
-            loop_broken = 1;
-            break;
-        }
-        min_width -= thousands_sep_len;
+        _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos,
+                                      thousands_sep, 0,
+                                      thousands_sep_len);
    }
-    if (!loop_broken) {
-        /* We left the loop without using a break statement. */

-        l = Py_MAX(Py_MAX(remaining, min_width), 1);
-        n_zeros = Py_MAX(0, l - remaining);
-        n_chars = Py_MAX(0, Py_MIN(remaining, l));
-
-        /* Use n_zero zero's and n_chars chars */
-        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
-        if (buffer) {
-            /* Copy into the output buffer. */
-            STRINGLIB(fill)(&digits_end, &buffer_end, n_chars, n_zeros,
-                 use_separator ? thousands_sep : NULL, thousands_sep_len);
-        }
+    *buffer_pos -= n_chars;
+    *digits_pos -= n_chars;
+    _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos,
+                                  digits, *digits_pos,
+                                  n_chars);
+
+    if (n_zeros) {
+        *buffer_pos -= n_zeros;
+        enum PyUnicode_Kind kind = PyUnicode_KIND(writer->buffer);
+        void *data = PyUnicode_DATA(writer->buffer);
+        unicode_fill(kind, data, '0', *buffer_pos, n_zeros);
    }
-    return count;
 }
-
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
--- a/Python/formatter_unicode.c
+++ b/Python/formatter_unicode.c
@@ -462,7 +462,8 @@ parse_number(PyObject *s, Py_ssize_t pos, Py_ssize_t end,
 /* not all fields of format are used.  for example, precision is
   unused.  should this take discrete params in order to be more clear
   about what it does?  or is passing a single format parameter easier
-   and more efficient enough to justify a little obfuscation? */
+   and more efficient enough to justify a little obfuscation?
+   Return -1 on error. */
 static Py_ssize_t
 calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
                   Py_UCS4 sign_char, PyObject *number, Py_ssize_t n_start,
@@ -541,9 +542,12 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix,
        Py_UCS4 grouping_maxchar;
        spec->n_grouped_digits = _PyUnicode_InsertThousandsGrouping(
            NULL, 0,
-            0, NULL,
-            spec->n_digits, spec->n_min_width,
+            NULL, 0, spec->n_digits,
+            spec->n_min_width,
            locale->grouping, locale->thousands_sep, &grouping_maxchar);
+        if (spec->n_grouped_digits == -1) {
+            return -1;
+        }
        *maxchar = Py_MAX(*maxchar, grouping_maxchar);
    }

@@ -635,26 +639,14 @@ fill_number(_PyUnicodeWriter *writer, const NumberFieldWidths *spec,
    /* Only for type 'c' special case, it has no digits. */
    if (spec->n_digits != 0) {
        /* Fill the digits with InsertThousandsGrouping. */
-        char *pdigits;
-        if (PyUnicode_READY(digits))
-            return -1;
-        pdigits = PyUnicode_DATA(digits);
-        if (PyUnicode_KIND(digits) < kind) {
-            pdigits = _PyUnicode_AsKind(digits, kind);
-            if (pdigits == NULL)
-                return -1;
-        }
        r = _PyUnicode_InsertThousandsGrouping(
-                writer->buffer, writer->pos,
-                spec->n_grouped_digits,
-                pdigits + kind * d_pos,
-                spec->n_digits, spec->n_min_width,
+                writer, spec->n_grouped_digits,
+                digits, d_pos, spec->n_digits,
+                spec->n_min_width,
                locale->grouping, locale->thousands_sep, NULL);
        if (r == -1)
            return -1;
        assert(r == spec->n_grouped_digits);
-        if (PyUnicode_KIND(digits) < kind)
-            PyMem_Free(pdigits);
        d_pos += spec->n_digits;
    }
    if (toupper) {
@@ -994,6 +986,9 @@ format_long_internal(PyObject *value, const InternalFormatSpec *format,
    n_total = calc_number_widths(&spec, n_prefix, sign_char, tmp, inumeric_chars,
                                 inumeric_chars + n_digits, n_remainder, 0,
                                 &locale, format, &maxchar);
+    if (n_total == -1) {
+        goto done;
+    }

    /* Allocate the memory. */
    if (_PyUnicodeWriter_Prepare(writer, n_total, maxchar) == -1)
@@ -1139,6 +1134,9 @@ format_float_internal(PyObject *value,
    n_total = calc_number_widths(&spec, 0, sign_char, unicode_tmp, index,
                                 index + n_digits, n_remainder, has_decimal,
                                 &locale, format, &maxchar);
+    if (n_total == -1) {
+        goto done;
+    }

    /* Allocate the memory. */
    if (_PyUnicodeWriter_Prepare(writer, n_total, maxchar) == -1)
@@ -1322,6 +1320,9 @@ format_complex_internal(PyObject *value,
                                    i_re, i_re + n_re_digits, n_re_remainder,
                                    re_has_decimal, &locale, &tmp_format,
                                    &maxchar);
+    if (n_re_total == -1) {
+        goto done;
+    }

    /* Same formatting, but always include a sign, unless the real part is
     * going to be omitted, in which case we use whatever sign convention was
@@ -1332,6 +1333,9 @@ format_complex_internal(PyObject *value,
                                    i_im, i_im + n_im_digits, n_im_remainder,
                                    im_has_decimal, &locale, &tmp_format,
                                    &maxchar);
+    if (n_im_total == -1) {
+        goto done;
+    }

    if (skip_re)
        n_re_total = 0;