Commit 3a50e705 authored by Victor Stinner's avatar Victor Stinner

Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

error handlers on all Windows versions. The MBCS codec is now supporting all
error handlers, instead of only replace to encode and ignore to decode.
parent 1e73a246
...@@ -1280,12 +1280,13 @@ functions can be used directly if desired. ...@@ -1280,12 +1280,13 @@ functions can be used directly if desired.
.. module:: encodings.mbcs .. module:: encodings.mbcs
:synopsis: Windows ANSI codepage :synopsis: Windows ANSI codepage
Encode operand according to the ANSI codepage (CP_ACP). This codec only Encode operand according to the ANSI codepage (CP_ACP).
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
``'strict'`` and ``'ignore'`` error handlers to decode.
Availability: Windows only. Availability: Windows only.
.. versionchanged:: 3.3
Support any error handler.
.. versionchanged:: 3.2 .. versionchanged:: 3.2
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
to encode, and ``'ignore'`` to decode. to encode, and ``'ignore'`` to decode.
......
...@@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and ...@@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
codecs codecs
------ ------
The :mod:`~encodings.mbcs` codec has been rewritten to correctly handle
``replace`` and ``ignore`` error handlers on all Windows versions. The
:mod:`~encodings.mbcs` codec now supports all error handlers, instead of
only ``replace`` to encode and ``ignore`` to decode.
Multibyte CJK decoders now resynchronize faster. They only ignore the first Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312', byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character. 'replace')`` now returns a ``\n`` after the replacement character.
......
...@@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( ...@@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
Py_ssize_t *consumed /* bytes consumed */ Py_ssize_t *consumed /* bytes consumed */
); );
/* Decode `length` bytes of `string` using the Windows code page identified
   by `code_page`, applying the `errors` error handler.
   NOTE(review): `consumed` is presumably an optional out-parameter (may be
   NULL) that receives the number of bytes actually decoded -- confirm
   against the implementation. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
int code_page, /* code page number */
const char *string, /* encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed */
);
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */ PyObject *unicode /* Unicode object */
); );
...@@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( ...@@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
#ifndef Py_LIMITED_API #ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
const Py_UNICODE *data, /* Unicode char buffer */ const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors /* error handling */ const char *errors /* error handling */
); );
#endif #endif
/* Encode the Unicode object `unicode` using the Windows code page identified
   by `code_page`, applying the `errors` error handler.
   NOTE(review): presumably returns a new bytes object, or NULL with an
   exception set on failure -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
int code_page, /* code page number */
PyObject *unicode, /* Unicode object */
const char *errors /* error handling */
);
#endif /* HAVE_MBCS */ #endif /* HAVE_MBCS */
/* --- Decimal Encoder ---------------------------------------------------- */ /* --- Decimal Encoder ---------------------------------------------------- */
......
...@@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase): ...@@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
self.assertEqual(sout, b"\x80") self.assertEqual(sout, b"\x80")
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode().

    Each test feeds a table of ``(input, errors, expected)`` triples to
    check_encode() or check_decode(). An ``expected`` value of None means
    the call must raise UnicodeEncodeError / UnicodeDecodeError.

    The whole class is skipped on non-Windows platforms, where the code page
    functions are not available.
    """

    CP_UTF8 = 65001
    # sys.getwindowsversion() only exists on Windows. Check the platform
    # first so that defining this class (i.e. importing the test module)
    # does not raise AttributeError elsewhere.
    vista_or_later = (sys.platform == 'win32'
                      and sys.getwindowsversion().major >= 6)

    def test_invalid_code_page(self):
        # Negative code page numbers are rejected with ValueError; an
        # unknown code page number (123) is reported as WindowsError.
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        # The error message must mention the name of the code page.
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
                               codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
                               codecs.code_page_decode, 932, b'\x81\x00')
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
                               codecs.code_page_decode, self.CP_UTF8, b'\xff')

    def check_decode(self, cp, tests):
        """Run decoding test cases against code page *cp*.

        *tests* is an iterable of ``(raw, errors, expected)`` triples. When
        *expected* is None, decoding must raise UnicodeDecodeError; otherwise
        the decoded text must equal *expected* and the reported number of
        consumed bytes must lie in ``0..len(raw)``.
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                                 '%a.decode("cp%s", %r)=%a != %a'
                                 % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                                  codecs.code_page_decode, cp, raw, errors)

    def check_encode(self, cp, tests):
        """Run encoding test cases against code page *cp*.

        *tests* is an iterable of ``(text, errors, expected)`` triples. When
        *expected* is None, encoding must raise UnicodeEncodeError; otherwise
        the encoded bytes must equal *expected* and the reported length must
        equal ``len(text)``.
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                                 '%a.encode("cp%s", %r)=%a != %a'
                                 % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                                  codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # not encodable
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
        ))
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'\xff', 'strict', None),
            (b'\xff', 'ignore', ''),
            (b'\xff', 'replace', '\ufffd'),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
        ]
        # The expected 'replace' result depends on the Windows version.
        if self.vista_or_later:
            tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
        else:
            tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
        self.check_decode(932, tests)

    def test_cp1252(self):
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'+AOkgrA-'),
            ('\U0010ffff', 'strict', b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_cp_utf8(self):
        cp = self.CP_UTF8

        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
        ]
        # Expected handling of a lone surrogate depends on the Windows
        # version.
        if self.vista_or_later:
            tests.append(('\udc80', 'strict', None))
            tests.append(('\udc80', 'ignore', b''))
            tests.append(('\udc80', 'replace', b'?'))
        else:
            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
        self.check_encode(cp, tests)

        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
        ]
        if self.vista_or_later:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', None),
                (b'[\xed\xb2\x80]', 'ignore', '[]'),
                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
            ))
        else:
            tests.extend((
                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
            ))
        self.check_decode(cp, tests)

    def test_error_handlers(self):
        # Error handlers other than strict/ignore/replace.
        self.check_encode(932, (
            ('\xff', 'backslashreplace', b'\\xff'),
            ('\xff', 'xmlcharrefreplace', b'&#255;'),
        ))
        self.check_decode(932, (
            (b'\xff', 'surrogateescape', '\udcff'),
        ))
        if self.vista_or_later:
            self.check_encode(self.CP_UTF8, (
                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
            ))

    def test_multibyte_encoding(self):
        # An invalid byte followed by a valid multibyte sequence: only the
        # invalid part may be ignored or replaced.
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        if self.vista_or_later:
            self.check_encode(self.CP_UTF8, (
                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
            ))

    def test_incremental(self):
        # With final=False, a trailing incomplete multibyte sequence is left
        # unconsumed instead of raising an error.
        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))
def test_main(): def test_main():
support.run_unittest( support.run_unittest(
UTF32Test, UTF32Test,
...@@ -1772,6 +1969,7 @@ def test_main(): ...@@ -1772,6 +1969,7 @@ def test_main():
SurrogateEscapeTest, SurrogateEscapeTest,
BomTest, BomTest,
TransformCodecTest, TransformCodecTest,
CodePageTest,
) )
......
...@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1? ...@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #12281: Rewrite the MBCS codec to correctly handle the replace and
ignore error handlers on all Windows versions. The MBCS codec now supports all
error handlers, instead of only replace to encode and ignore to decode.
- Issue #13188: When called without an explicit traceback argument, - Issue #13188: When called without an explicit traceback argument,
generator.throw() now gets the traceback from the passed exception's generator.throw() now gets the traceback from the passed exception's
``__traceback__`` attribute. Patch by Petri Lehtinen. ``__traceback__`` attribute. Patch by Petri Lehtinen.
......
...@@ -612,6 +612,31 @@ mbcs_decode(PyObject *self, ...@@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
return codec_tuple(decoded, consumed); return codec_tuple(decoded, consumed);
} }
/* code_page_decode(code_page, data[, errors[, final]])
 *
 * Decode a bytes-like object using the given Windows code page by calling
 * PyUnicode_DecodeCodePageStateful(), and hand the decoded object together
 * with the consumed byte count to codec_tuple().
 *
 * When `final` is true, no "consumed" pointer is passed to the decoder and
 * the whole buffer length is reported as consumed.
 */
static PyObject *
code_page_decode(PyObject *self,
                 PyObject *args)
{
    int code_page;
    Py_buffer view;
    const char *errors = NULL;
    int final = 0;
    Py_ssize_t consumed;
    Py_ssize_t *consumed_ptr;
    PyObject *text;

    if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
                          &code_page, &view, &errors, &final))
        return NULL;

    /* Start from "everything consumed"; the decoder only gets a chance to
       overwrite this when it receives a non-NULL pointer (final == 0). */
    consumed = view.len;
    consumed_ptr = final ? NULL : &consumed;

    text = PyUnicode_DecodeCodePageStateful(code_page,
                                            view.buf, view.len, errors,
                                            consumed_ptr);
    /* The buffer is released on both the success and the failure path. */
    PyBuffer_Release(&view);
    if (text == NULL)
        return NULL;
    return codec_tuple(text, consumed);
}
#endif /* HAVE_MBCS */ #endif /* HAVE_MBCS */
/* --- Encoder ------------------------------------------------------------ */ /* --- Encoder ------------------------------------------------------------ */
...@@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self, ...@@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
return v; return v;
} }
/* code_page_encode(code_page, obj[, errors])
 *
 * Convert `obj` to a Unicode object, encode it with the given Windows code
 * page through PyUnicode_EncodeCodePage(), and hand the encoder result
 * together with the length of the text to codec_tuple().
 */
static PyObject *
code_page_encode(PyObject *self,
                 PyObject *args)
{
    int code_page;
    PyObject *text;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
                          &code_page, &text, &errors))
        return NULL;

    /* PyUnicode_FromObject() yields a reference we own; it is dropped
       below once the result tuple has been built. */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
        return NULL;

    /* The encoder result is passed to codec_tuple() unchecked, so a NULL
       (failed) result is handled there.
       NOTE(review): presumably codec_tuple() propagates NULL -- confirm. */
    encoded = PyUnicode_EncodeCodePage(code_page, text, errors);
    result = codec_tuple(encoded, PyUnicode_GET_LENGTH(text));
    Py_DECREF(text);
    return result;
}
#endif /* HAVE_MBCS */ #endif /* HAVE_MBCS */
/* --- Error handler registry --------------------------------------------- */ /* --- Error handler registry --------------------------------------------- */
...@@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = { ...@@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef HAVE_MBCS #ifdef HAVE_MBCS
{"mbcs_encode", mbcs_encode, METH_VARARGS}, {"mbcs_encode", mbcs_encode, METH_VARARGS},
{"mbcs_decode", mbcs_decode, METH_VARARGS}, {"mbcs_decode", mbcs_decode, METH_VARARGS},
{"code_page_encode", code_page_encode, METH_VARARGS},
{"code_page_decode", code_page_decode, METH_VARARGS},
#endif #endif
{"register_error", register_error, METH_VARARGS, {"register_error", register_error, METH_VARARGS,
register_error__doc__}, register_error__doc__},
......
This diff is collapsed.
...@@ -67,7 +67,7 @@ static void initsigs(void); ...@@ -67,7 +67,7 @@ static void initsigs(void);
static void call_py_exitfuncs(void); static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void); static void wait_for_thread_shutdown(void);
static void call_ll_exitfuncs(void); static void call_ll_exitfuncs(void);
extern void _PyUnicode_Init(void); extern int _PyUnicode_Init(void);
extern void _PyUnicode_Fini(void); extern void _PyUnicode_Fini(void);
extern int _PyLong_Init(void); extern int _PyLong_Init(void);
extern void PyLong_Fini(void); extern void PyLong_Fini(void);
...@@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs) ...@@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
Py_FatalError("Py_Initialize: can't make modules_reloading dictionary"); Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");
/* Init Unicode implementation; relies on the codec registry */ /* Init Unicode implementation; relies on the codec registry */
_PyUnicode_Init(); if (_PyUnicode_Init() < 0)
Py_FatalError("Py_Initialize: can't initialize unicode");
bimod = _PyBuiltin_Init(); bimod = _PyBuiltin_Init();
if (bimod == NULL) if (bimod == NULL)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment