Close #13093: PyUnicode_EncodeDecimal() doesn't support error handlers other than "strict" anymore

The caller was unable to compute the size of the output buffer, because that size depends on the error handler.

Close #13093: PyUnicode_EncodeDecimal() doesn't support error handlers other than "strict" anymore.
The caller was unable to compute the size of the output buffer, because that size depends on the error handler.
6345be9a · Victor Stinner · e7ede067 · 6345be9a · 6345be9a · 6345be9a
Commit 6345be9a authored Nov 25, 2011 by Victor Stinner
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 127 deletions

Lib/test/test_unicode.py Lib/test/test_unicode.py +4 -14

Misc/NEWS Misc/NEWS +4 -0

Objects/unicodeobject.c Objects/unicodeobject.c +18 -113

No files found.
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1816,20 +1816,10 @@ class UnicodeTest(string_tests.CommonTest,
                         b' 3.14 ')
        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, "123\u20ac", "strict")
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
-                         b'123?')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
-                         b'123')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
-                         b'123&#8364;')
-        self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
-                         b'123\\u20ac')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\N{EM SPACE}", "replace"),
-                         b'123? ')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\u20ac", "replace"),
-                         b'123??')
-        self.assertEqual(unicode_encodedecimal("123\u20ac\u0660", "replace"),
-                         b'123?0')
+        self.assertRaisesRegex(
+            ValueError,
+            "^'decimal' codec can't encode character",
+            unicode_encodedecimal, "123\u20ac", "replace")
    def test_transform_decimal(self):
        from _testcapi import unicode_transformdecimaltoascii as transform_decimal

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
+- Issue #13093: PyUnicode_EncodeDecimal() doesn't support error handlers
+  different than "strict" anymore. The caller was unable to compute the
+  size of the output buffer: it depends on the error handler.
 - PEP 3155 / issue #13448: Qualified name for classes and functions.
 - Issue #13436: Fix a bogus error message when an AST object was passed

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8839,15 +8839,8 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
                        char *output,
                        const char *errors)
 {
-    PyObject *errorHandler = NULL;
-    PyObject *exc = NULL;
    PyObject *unicode;
-    const char *encoding = "decimal";
-    const char *reason = "invalid decimal Unicode string";
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
-    Py_ssize_t i, j;
+    Py_ssize_t i;
    enum PyUnicode_Kind kind;
    void *data;
@@ -8860,15 +8853,20 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
    if (unicode == NULL)
        return -1;
-    if (PyUnicode_READY(unicode) < 0)
-        goto onError;
+    if (PyUnicode_READY(unicode) < 0) {
+        Py_DECREF(unicode);
+        return -1;
+    }
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    for (i=0; i < length; ) {
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        PyObject *exc;
+        Py_UCS4 ch;
        int decimal;
-        Py_ssize_t startpos, endpos;
+        Py_ssize_t startpos;
+        ch = PyUnicode_READ(kind, data, i);
        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
@@ -8886,113 +8884,20 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
            i++;
            continue;
        }
-        /* All other characters are considered unencodable */
-        startpos = i;
-        endpos = i+1;
-        for (; endpos < length; endpos++) {
-            ch = PyUnicode_READ(kind, data, endpos);
-            if ((0 < ch && ch < 256) ||
-                Py_UNICODE_ISSPACE(ch) ||
-                0 <= Py_UNICODE_TODECIMAL(ch))
-                break;
-        }
-        /* cache callback name lookup
-         * (if not done yet, i.e. it's the first error) */
-        if (known_errorHandler==-1) {
-            if ((errors==NULL) || (!strcmp(errors, "strict")))
-                known_errorHandler = 1;
-            else if (!strcmp(errors, "replace"))
-                known_errorHandler = 2;
-            else if (!strcmp(errors, "ignore"))
-                known_errorHandler = 3;
-            else if (!strcmp(errors, "xmlcharrefreplace"))
-                known_errorHandler = 4;
-            else
-                known_errorHandler = 0;
-        }
-        switch (known_errorHandler) {
-        case 1: /* strict */
-            raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
-            goto onError;
-        case 2: /* replace */
-            for (j=startpos; j < endpos; j++)
-                *output++ = '?';
-            i = endpos;
-            break;
-        case 3: /* ignore */
-            i = endpos;
-            break;
-        case 4: /* xmlcharrefreplace */
-            /* generate replacement */
-            for (j=startpos; j < endpos; j++) {
-                ch = PyUnicode_READ(kind, data, i);
-                output += sprintf(output, "&#%d;", (int)ch);
-                i++;
-            }
-            break;
-        default:
-        {
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, k;
-            enum PyUnicode_Kind repkind;
-            void *repdata;
-            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                          encoding, reason, unicode, &exc,
-                                                          startpos, endpos, &newpos);
-            if (repunicode == NULL)
-                goto onError;
-            if (!PyUnicode_Check(repunicode)) {
-                /* Byte results not supported, since they have no decimal property. */
-                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
-                Py_DECREF(repunicode);
-                goto onError;
-            }
-            if (PyUnicode_READY(repunicode) < 0) {
-                Py_DECREF(repunicode);
-                goto onError;
-            }
-            repkind = PyUnicode_KIND(repunicode);
-            repdata = PyUnicode_DATA(repunicode);
-            /* generate replacement  */
-            repsize = PyUnicode_GET_SIZE(repunicode);
-            for (k=0; k<repsize; k++) {
-                ch = PyUnicode_READ(repkind, repdata, k);
-                if (Py_UNICODE_ISSPACE(ch))
-                    *output++ = ' ';
-                else {
-                    decimal = Py_UNICODE_TODECIMAL(ch);
-                    if (decimal >= 0)
-                        *output++ = '0' + decimal;
-                    else if (0 < ch && ch < 256)
-                        *output++ = (char)ch;
-                    else {
-                        Py_DECREF(repunicode);
-                        raise_encode_exception(&exc, encoding,
-                                               unicode, startpos, endpos,
-                                               reason);
-                        goto onError;
-                    }
-                }
-            }
-            i = newpos;
-            Py_DECREF(repunicode);
-        }
-        }
+        startpos = i;
+        exc = NULL;
+        raise_encode_exception(&exc, "decimal", unicode,
+                               startpos, startpos+1,
+                               "invalid decimal Unicode string");
+        Py_XDECREF(exc);
+        Py_DECREF(unicode);
+        return -1;
    }
    /* 0-terminate the output string */
    *output++ = '\0';
-    Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
    Py_DECREF(unicode);
    return 0;
-  onError:
-    Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
-    Py_DECREF(unicode);
-    return -1;
 }
 /* --- Helpers ------------------------------------------------------------ */