Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

error handlers on all Windows versions. The MBCS codec is now supporting all error handlers, instead of only replace to encode and ignore to decode.

Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore
error handlers on all Windows versions. The MBCS codec is now supporting all error handlers, instead of only replace to encode and ignore to decode.
3a50e705 · Victor Stinner · 1e73a246 · 3a50e705 · 3a50e705 · 3a50e705
Commit 3a50e705 authored Oct 18, 2011 by Victor Stinner
8 changed files
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1280,12 +1280,13 @@ functions can be used directly if desired.
 .. module:: encodings.mbcs
   :synopsis: Windows ANSI codepage

-Encode operand according to the ANSI codepage (CP_ACP). This codec only
-supports ``'strict'`` and ``'replace'`` error handlers to encode, and
-``'strict'`` and ``'ignore'`` error handlers to decode.
+Encode operand according to the ANSI codepage (CP_ACP).

 Availability: Windows only.

+.. versionchanged:: 3.3
+   Support any error handler.
+
 .. versionchanged:: 3.2
   Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
   to encode, and ``'ignore'`` to decode.

--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
 codecs
 ------

+The :mod:`~encodings.mbcs` codec has be rewritten to handle correclty
+``replace`` and ``ignore`` error handlers on all Windows versions. The
+:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
+only ``replace`` to encode and ``ignore`` to decode.
+
 Multibyte CJK decoders now resynchronize faster. They only ignore the first
 byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
 'replace')`` now returns a ``\n`` after the replacement character.

--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
    Py_ssize_t *consumed        /* bytes consumed */
    );

+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
+    int code_page,              /* code page number */
+    const char *string,         /* encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );
@@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
    const Py_UNICODE *data,     /* Unicode char buffer */
-    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
+    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
    const char *errors          /* error handling */
    );
 #endif

+PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
+    int code_page,              /* code page number */
+    PyObject *unicode,          /* Unicode object */
+    const char *errors          /* error handling */
+    );
+
 #endif /* HAVE_MBCS */

 /* --- Decimal Encoder ---------------------------------------------------- */

--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
            self.assertEqual(sout, b"\x80")


+class CodePageTest(unittest.TestCase):
+    CP_UTF8 = 65001
+    vista_or_later = (sys.getwindowsversion().major >= 6)
+
+    def test_invalid_code_page(self):
+        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
+        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
+        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
+        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+
+    def test_code_page_name(self):
+        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
+            codecs.code_page_encode, 932, '\xff')
+        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
+            codecs.code_page_decode, 932, b'\x81\x00')
+        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
+            codecs.code_page_decode, self.CP_UTF8, b'\xff')
+
+    def check_decode(self, cp, tests):
+        for raw, errors, expected in tests:
+            if expected is not None:
+                try:
+                    decoded = codecs.code_page_decode(cp, raw, errors)
+                except UnicodeDecodeError as err:
+                    self.fail('Unable to decode %a from "cp%s" with '
+                              'errors=%r: %s' % (raw, cp, errors, err))
+                self.assertEqual(decoded[0], expected,
+                    '%a.decode("cp%s", %r)=%a != %a'
+                    % (raw, cp, errors, decoded[0], expected))
+                # assert 0 <= decoded[1] <= len(raw)
+                self.assertGreaterEqual(decoded[1], 0)
+                self.assertLessEqual(decoded[1], len(raw))
+            else:
+                self.assertRaises(UnicodeDecodeError,
+                    codecs.code_page_decode, cp, raw, errors)
+
+    def check_encode(self, cp, tests):
+        for text, errors, expected in tests:
+            if expected is not None:
+                try:
+                    encoded = codecs.code_page_encode(cp, text, errors)
+                except UnicodeEncodeError as err:
+                    self.fail('Unable to encode %a to "cp%s" with '
+                              'errors=%r: %s' % (text, cp, errors, err))
+                self.assertEqual(encoded[0], expected,
+                    '%a.encode("cp%s", %r)=%a != %a'
+                    % (text, cp, errors, encoded[0], expected))
+                self.assertEqual(encoded[1], len(text))
+            else:
+                self.assertRaises(UnicodeEncodeError,
+                    codecs.code_page_encode, cp, text, errors)
+
+    def test_cp932(self):
+        self.check_encode(932, (
+            ('abc', 'strict', b'abc'),
+            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+            # not encodable
+            ('\xff', 'strict', None),
+            ('[\xff]', 'ignore', b'[]'),
+            ('[\xff]', 'replace', b'[y]'),
+            ('[\u20ac]', 'replace', b'[?]'),
+        ))
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
+            # invalid bytes
+            (b'\xff', 'strict', None),
+            (b'\xff', 'ignore', ''),
+            (b'\xff', 'replace', '\ufffd'),
+            (b'\x81\x00abc', 'strict', None),
+            (b'\x81\x00abc', 'ignore', '\x00abc'),
+        ]
+        if self.vista_or_later:
+            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
+        else:
+            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
+        self.check_decode(932, tests)
+
+    def test_cp1252(self):
+        self.check_encode(1252, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
+            ('\xff', 'strict', b'\xff'),
+            ('\u0141', 'strict', None),
+            ('\u0141', 'ignore', b''),
+            ('\u0141', 'replace', b'L'),
+        ))
+        self.check_decode(1252, (
+            (b'abc', 'strict', 'abc'),
+            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
+            (b'\xff', 'strict', '\xff'),
+        ))
+
+    def test_cp_utf7(self):
+        cp = 65000
+        self.check_encode(cp, (
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
+            ('\U0010ffff', 'strict',  b'+2//f/w-'),
+            ('\udc80', 'strict', b'+3IA-'),
+            ('\ufffd', 'strict', b'+//0-'),
+        ))
+        self.check_decode(cp, (
+            (b'abc', 'strict', 'abc'),
+            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
+            (b'+2//f/w-', 'strict', '\U0010ffff'),
+            (b'+3IA-', 'strict', '\udc80'),
+            (b'+//0-', 'strict', '\ufffd'),
+            # invalid bytes
+            (b'[+/]', 'strict', '[]'),
+            (b'[\xff]', 'strict', '[\xff]'),
+        ))
+
+    def test_cp_utf8(self):
+        cp = self.CP_UTF8
+
+        tests = [
+            ('abc', 'strict', b'abc'),
+            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
+            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+        ]
+        if self.vista_or_later:
+            tests.append(('\udc80', 'strict', None))
+            tests.append(('\udc80', 'ignore', b''))
+            tests.append(('\udc80', 'replace', b'?'))
+        else:
+            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
+        self.check_encode(cp, tests)
+
+        tests = [
+            (b'abc', 'strict', 'abc'),
+            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
+            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
+            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
+            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
+            # invalid bytes
+            (b'[\xff]', 'strict', None),
+            (b'[\xff]', 'ignore', '[]'),
+            (b'[\xff]', 'replace', '[\ufffd]'),
+        ]
+        if self.vista_or_later:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', None),
+                (b'[\xed\xb2\x80]', 'ignore', '[]'),
+                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
+            ))
+        else:
+            tests.extend((
+                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
+            ))
+        self.check_decode(cp, tests)
+
+    def test_error_handlers(self):
+        self.check_encode(932, (
+            ('\xff', 'backslashreplace', b'\\xff'),
+            ('\xff', 'xmlcharrefreplace', b'&#255;'),
+        ))
+        self.check_decode(932, (
+            (b'\xff', 'surrogateescape', '\udcff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
+            ))
+
+    def test_multibyte_encoding(self):
+        self.check_decode(932, (
+            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
+            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
+        ))
+        self.check_decode(self.CP_UTF8, (
+            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
+            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
+        ))
+        if self.vista_or_later:
+            self.check_encode(self.CP_UTF8, (
+                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+            ))
+
+    def test_incremental(self):
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e', 2))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'\xe9\x80\xe9\x80', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
+
+        decoded = codecs.code_page_decode(932,
+                                          b'abc', 'strict',
+                                          False)
+        self.assertEqual(decoded, ('abc', 3))
+
+
 def test_main():
    support.run_unittest(
        UTF32Test,
@@ -1772,6 +1969,7 @@ def test_main():
        SurrogateEscapeTest,
        BomTest,
        TransformCodecTest,
+        CodePageTest,
    )



--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore
+  error handlers on all Windows versions. The MBCS codec is now supporting all
+  error handlers, instead of only replace to encode and ignore to decode.
+
 - Issue #13188: When called without an explicit traceback argument,
  generator.throw() now gets the traceback from the passed exception's
  ``__traceback__`` attribute.  Patch by Petri Lehtinen.

--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
    return codec_tuple(decoded, consumed);
 }

+static PyObject *
+code_page_decode(PyObject *self,
+                 PyObject *args)
+{
+    Py_buffer pbuf;
+    const char *errors = NULL;
+    int final = 0;
+    Py_ssize_t consumed;
+    PyObject *decoded = NULL;
+    int code_page;
+
+    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
+                          &code_page, &pbuf, &errors, &final))
+        return NULL;
+    consumed = pbuf.len;
+
+    decoded = PyUnicode_DecodeCodePageStateful(code_page,
+                                               pbuf.buf, pbuf.len, errors,
+                                               final ? NULL : &consumed);
+    PyBuffer_Release(&pbuf);
+    if (decoded == NULL)
+        return NULL;
+    return codec_tuple(decoded, consumed);
+}
+
 #endif /* HAVE_MBCS */

 /* --- Encoder ------------------------------------------------------------ */
@@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
    return v;
 }

+static PyObject *
+code_page_encode(PyObject *self,
+                 PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+    int code_page;
+
+    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
+                          &code_page, &str, &errors))
+        return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+        return NULL;
+    v = codec_tuple(PyUnicode_EncodeCodePage(code_page,
+                                             str,
+                                             errors),
+                    PyUnicode_GET_LENGTH(str));
+    Py_DECREF(str);
+    return v;
+}
+
 #endif /* HAVE_MBCS */

 /* --- Error handler registry --------------------------------------------- */
@@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
 #ifdef HAVE_MBCS
    {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
    {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
+    {"code_page_encode",        code_page_encode,               METH_VARARGS},
+    {"code_page_decode",        code_page_decode,               METH_VARARGS},
 #endif
    {"register_error",          register_error,                 METH_VARARGS,
        register_error__doc__},

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -67,7 +67,7 @@ static void initsigs(void);
 static void call_py_exitfuncs(void);
 static void wait_for_thread_shutdown(void);
 static void call_ll_exitfuncs(void);
-extern void _PyUnicode_Init(void);
+extern int _PyUnicode_Init(void);
 extern void _PyUnicode_Fini(void);
 extern int _PyLong_Init(void);
 extern void PyLong_Fini(void);
@@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
        Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");

    /* Init Unicode implementation; relies on the codec registry */
-    _PyUnicode_Init();
+    if (_PyUnicode_Init() < 0)
+        Py_FatalError("Py_Initialize: can't initialize unicode");

    bimod = _PyBuiltin_Init();
    if (bimod == NULL)