Issue #27959: Adds oem encoding, aliases ansi to mbcs, moves aliasmbcs to codec lookup

f5aba584 · Steve Dower · 22d0698d · f5aba584 · f5aba584 · f5aba584
Commit f5aba584 authored Sep 06, 2016 by Steve Dower
8 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1663,7 +1663,7 @@ PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(

 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
    const char *string,         /* MBCS encoded string */
-    Py_ssize_t length,              /* size of string */
+    Py_ssize_t length,          /* size of string */
    const char *errors          /* error handling */
    );


--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -29,6 +29,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 """#"

 import codecs
+import sys
 from . import aliases

 _cache = {}
@@ -151,3 +152,25 @@ def search_function(encoding):

 # Register the search_function in the Python codec registry
 codecs.register(search_function)
+
+if sys.platform == 'win32':
+    def _alias_mbcs(encoding):
+        """Codec search function aliasing the preferred encoding to mbcs.
+
+        Returns the mbcs CodecInfo when *encoding* equals
+        _bootlocale.getpreferredencoding(False), and None otherwise so
+        that the codec registry reports the encoding as not found here.
+        """
+        try:
+            import _bootlocale
+            if encoding == _bootlocale.getpreferredencoding(False):
+                import encodings.mbcs
+                return encodings.mbcs.getregentry()
+        except ImportError:
+            # Imports may fail while the interpreter is shutting down;
+            # answer "not found" instead of letting the ImportError
+            # escape from codecs.lookup().
+            pass
+        return None
+
+    codecs.register(_alias_mbcs)
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -458,6 +458,7 @@ aliases = {
    'macturkish'         : 'mac_turkish',

    # mbcs codec
+    'ansi'               : 'mbcs',
    'dbcs'               : 'mbcs',

    # ptcp154 codec

--- a/Lib/encodings/oem.py
+++ b/Lib/encodings/oem.py
+""" Python 'oem' Codec for Windows
+
+"""
+# Import them explicitly to cause an ImportError
+# on non-Windows systems
+from codecs import oem_encode, oem_decode
+# for IncrementalDecoder, IncrementalEncoder, ...
+import codecs
+
+### Codec APIs
+
+encode = oem_encode  # module-level encode is the imported oem_encode itself
+
+def decode(input, errors='strict'):
+    """Decode *input* in a single call using oem_decode.
+
+    The third argument given to oem_decode (its ``final`` parameter) is
+    always True.  Whatever oem_decode returns is passed back unchanged.
+    """
+    final = True
+    return oem_decode(input, errors, final)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    """Incremental encoder that hands each chunk to oem_encode."""
+
+    def encode(self, input, final=False):
+        # *final* is accepted but not used; only item 0 of the value
+        # returned by oem_encode is handed back to the caller.
+        result = oem_encode(input, self.errors)
+        return result[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    _buffer_decode = oem_decode  # the imported oem_decode is used directly
+
+class StreamWriter(codecs.StreamWriter):
+    encode = oem_encode  # encoding is delegated to oem_encode as-is
+
+class StreamReader(codecs.StreamReader):
+    decode = oem_decode  # decoding is delegated to oem_decode as-is
+
+### encodings module API
+
+def getregentry():
+    """Return the codecs.CodecInfo entry describing the 'oem' codec.
+
+    The entry bundles this module's encode/decode functions with its
+    incremental and stream classes under the name 'oem'.
+    """
+    info = codecs.CodecInfo(
+        name='oem',
+        encode=encode,
+        decode=decode,
+        streamwriter=StreamWriter,
+        streamreader=StreamReader,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+    )
+    return info
--- a/Lib/site.py
+++ b/Lib/site.py
@@ -423,21 +423,6 @@ def enablerlcompleter():

    sys.__interactivehook__ = register_readline

-def aliasmbcs():
-    """On Windows, some default encodings are not provided by Python,
-    while they are always available as "mbcs" in each locale. Make
-    them usable by aliasing to "mbcs" in such a case."""
-    if sys.platform == 'win32':
-        import _bootlocale, codecs
-        enc = _bootlocale.getpreferredencoding(False)
-        if enc.startswith('cp'):            # "cp***" ?
-            try:
-                codecs.lookup(enc)
-            except LookupError:
-                import encodings
-                encodings._cache[enc] = encodings._unknown
-                encodings.aliases.aliases[enc] = 'mbcs'
-
 CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$'

 def venv(known_paths):
@@ -560,7 +545,6 @@ def main():
    setcopyright()
    sethelper()
    enablerlcompleter()
-    aliasmbcs()
    execsitecustomize()
    if ENABLE_USER_SITE:
        execusercustomize()

--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -8,11 +8,6 @@ import encodings

 from test import support

-if sys.platform == 'win32':
-    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
-else:
-    VISTA_OR_LATER = False
-
 try:
    import ctypes
 except ImportError:
@@ -841,18 +836,13 @@ class CP65001Test(ReadTest, unittest.TestCase):
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+            ('\udc80', 'strict', None),
+            ('\udc80', 'ignore', b''),
+            ('\udc80', 'replace', b'?'),
+            ('\udc80', 'backslashreplace', b'\\udc80'),
+            ('\udc80', 'namereplace', b'\\udc80'),
+            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
-        if VISTA_OR_LATER:
-            tests.extend((
-                ('\udc80', 'strict', None),
-                ('\udc80', 'ignore', b''),
-                ('\udc80', 'replace', b'?'),
-                ('\udc80', 'backslashreplace', b'\\udc80'),
-                ('\udc80', 'namereplace', b'\\udc80'),
-                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
-            ))
-        else:
-            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        for text, errors, expected in tests:
            if expected is not None:
                try:
@@ -879,17 +869,10 @@ class CP65001Test(ReadTest, unittest.TestCase):
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
+            (b'[\xed\xb2\x80]', 'strict', None),
+            (b'[\xed\xb2\x80]', 'ignore', '[]'),
+            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
-        if VISTA_OR_LATER:
-            tests.extend((
-                (b'[\xed\xb2\x80]', 'strict', None),
-                (b'[\xed\xb2\x80]', 'ignore', '[]'),
-                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
-            ))
-        else:
-            tests.extend((
-                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
-            ))
        for raw, errors, expected in tests:
            if expected is not None:
                try:
@@ -904,7 +887,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
                self.assertRaises(UnicodeDecodeError,
                    raw.decode, 'cp65001', errors)

-    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_lone_surrogates(self):
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
@@ -921,7 +903,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

-    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
    def test_surrogatepass_handler(self):
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
@@ -1951,6 +1932,8 @@ all_unicode_encodings = [

 if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
+if hasattr(codecs, "oem_encode"):
+    all_unicode_encodings.append("oem")

 # The following encoding is not tested, because it's not supposed
 # to work:
@@ -3119,11 +3102,10 @@ class CodePageTest(unittest.TestCase):
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
-        if VISTA_OR_LATER:
-            self.check_encode(self.CP_UTF8, (
-                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
-                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
-            ))
+        self.check_encode(self.CP_UTF8, (
+            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+        ))

    def test_incremental(self):
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
@@ -3144,6 +3126,20 @@ class CodePageTest(unittest.TestCase):
                                          False)
        self.assertEqual(decoded, ('abc', 3))

+    def test_mbcs_alias(self):
+        # Looking up the 'default' code page must return the mbcs codec
+        # when no more specific codec is available for that name.  A stub
+        # reporting a made-up code page stands in for getpreferredencoding;
+        # swap_attr puts the real function back when the block exits.
+        import _bootlocale
+        fake_codepage = 'cp123'
+        def report_fake_codepage(*args):
+            return fake_codepage
+        with support.swap_attr(_bootlocale, 'getpreferredencoding',
+                               report_fake_codepage):
+            codec = codecs.lookup(fake_codepage)
+        self.assertEqual(codec.name, 'mbcs')
+

 class ASCIITest(unittest.TestCase):
    def test_encode(self):

--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -625,6 +625,25 @@ _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
    return codec_tuple(decoded, consumed);
 }

+/*[clinic input]
+_codecs.oem_decode
+    data: Py_buffer
+    errors: str(accept={str, NoneType}) = NULL
+    final: int(c_default="0") = False
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
+                        const char *errors, int final)
+/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
+{
+    Py_ssize_t consumed = data->len;  /* stays data->len if final */
+    PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
+        data->buf, data->len, errors, final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);  /* result + consumed count */
+}
+
 /*[clinic input]
 _codecs.code_page_decode
    codepage: int
@@ -970,6 +989,21 @@ _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
                       PyUnicode_GET_LENGTH(str));
 }

+/*[clinic input]
+_codecs.oem_encode
+    str: unicode
+    errors: str(accept={str, NoneType}) = NULL
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
+/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
+{
+    return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
+        PyUnicode_GET_LENGTH(str));  /* second item: length of input str */
+}
+
 /*[clinic input]
 _codecs.code_page_encode
    code_page: int
@@ -1075,6 +1109,8 @@ static PyMethodDef _codecs_functions[] = {
    _CODECS_READBUFFER_ENCODE_METHODDEF
    _CODECS_MBCS_ENCODE_METHODDEF
    _CODECS_MBCS_DECODE_METHODDEF
+    _CODECS_OEM_ENCODE_METHODDEF
+    _CODECS_OEM_DECODE_METHODDEF
    _CODECS_CODE_PAGE_ENCODE_METHODDEF
    _CODECS_CODE_PAGE_DECODE_METHODDEF
    _CODECS_REGISTER_ERROR_METHODDEF

--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -805,6 +805,45 @@ exit:

 #if defined(HAVE_MBCS)

+PyDoc_STRVAR(_codecs_oem_decode__doc__,
+"oem_decode($module, data, errors=None, final=False, /)\n"
+"--\n"
+"\n");
+
+#define _CODECS_OEM_DECODE_METHODDEF    \
+    {"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__},
+
+static PyObject *
+_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
+                        const char *errors, int final);
+
+static PyObject *
+_codecs_oem_decode(PyObject *module, PyObject *args)
+{
+    PyObject *return_value = NULL;
+    Py_buffer data = {NULL, NULL};
+    const char *errors = NULL;
+    int final = 0;
+
+    if (!PyArg_ParseTuple(args, "y*|zi:oem_decode",
+        &data, &errors, &final)) {
+        goto exit;
+    }
+    return_value = _codecs_oem_decode_impl(module, &data, errors, final);
+
+exit:
+    /* Cleanup for data */
+    if (data.obj) {
+       PyBuffer_Release(&data);
+    }
+
+    return return_value;
+}
+
+#endif /* defined(HAVE_MBCS) */
+
+#if defined(HAVE_MBCS)
+
 PyDoc_STRVAR(_codecs_code_page_decode__doc__,
 "code_page_decode($module, codepage, data, errors=None, final=False, /)\n"
 "--\n"
@@ -1346,6 +1385,38 @@ exit:

 #if defined(HAVE_MBCS)

+PyDoc_STRVAR(_codecs_oem_encode__doc__,
+"oem_encode($module, str, errors=None, /)\n"
+"--\n"
+"\n");
+
+#define _CODECS_OEM_ENCODE_METHODDEF    \
+    {"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__},
+
+static PyObject *
+_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors);
+
+static PyObject *
+_codecs_oem_encode(PyObject *module, PyObject *args)
+{
+    PyObject *return_value = NULL;
+    PyObject *str;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "U|z:oem_encode",
+        &str, &errors)) {
+        goto exit;
+    }
+    return_value = _codecs_oem_encode_impl(module, str, errors);
+
+exit:
+    return return_value;
+}
+
+#endif /* defined(HAVE_MBCS) */
+
+#if defined(HAVE_MBCS)
+
 PyDoc_STRVAR(_codecs_code_page_encode__doc__,
 "code_page_encode($module, code_page, str, errors=None, /)\n"
 "--\n"
@@ -1446,6 +1517,10 @@ exit:
    #define _CODECS_MBCS_DECODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */

+#ifndef _CODECS_OEM_DECODE_METHODDEF
+    #define _CODECS_OEM_DECODE_METHODDEF
+#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */
+
 #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF
    #define _CODECS_CODE_PAGE_DECODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */
@@ -1454,7 +1529,11 @@ exit:
    #define _CODECS_MBCS_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */

+#ifndef _CODECS_OEM_ENCODE_METHODDEF
+    #define _CODECS_OEM_ENCODE_METHODDEF
+#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */
+
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
    #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/