Commit 3a50e705 authored by Victor Stinner's avatar Victor Stinner

Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore

error handlers on all Windows versions. The MBCS codec is now supporting all
error handlers, instead of only replace to encode and ignore to decode.
parent 1e73a246
......@@ -1280,12 +1280,13 @@ functions can be used directly if desired.
.. module:: encodings.mbcs
:synopsis: Windows ANSI codepage
Encode operand according to the ANSI codepage (CP_ACP). This codec only
supports ``'strict'`` and ``'replace'`` error handlers to encode, and
``'strict'`` and ``'ignore'`` error handlers to decode.
Encode operand according to the ANSI codepage (CP_ACP).
Availability: Windows only.
.. versionchanged:: 3.3
Support any error handler.
.. versionchanged:: 3.2
Before 3.2, the *errors* argument was ignored; ``'replace'`` was always used
to encode, and ``'ignore'`` to decode.
......
......@@ -197,6 +197,11 @@ The :mod:`array` module supports the :c:type:`long long` type using ``q`` and
codecs
------
The :mod:`~encodings.mbcs` codec has be rewritten to handle correclty
``replace`` and ``ignore`` error handlers on all Windows versions. The
:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
only ``replace`` to encode and ``ignore`` to decode.
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character.
......
......@@ -1466,6 +1466,14 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
Py_ssize_t *consumed /* bytes consumed */
);
PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
int code_page, /* code page number */
const char *string, /* encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed /* bytes consumed */
);
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */
);
......@@ -1473,11 +1481,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
const char *errors /* error handling */
);
#endif
PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
int code_page, /* code page number */
PyObject *unicode, /* Unicode object */
const char *errors /* error handling */
);
#endif /* HAVE_MBCS */
/* --- Decimal Encoder ---------------------------------------------------- */
......
......@@ -1744,6 +1744,203 @@ class TransformCodecTest(unittest.TestCase):
self.assertEqual(sout, b"\x80")
class CodePageTest(unittest.TestCase):
CP_UTF8 = 65001
vista_or_later = (sys.getwindowsversion().major >= 6)
def test_invalid_code_page(self):
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
def test_code_page_name(self):
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
codecs.code_page_encode, 932, '\xff')
self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
codecs.code_page_decode, 932, b'\x81\x00')
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
codecs.code_page_decode, self.CP_UTF8, b'\xff')
def check_decode(self, cp, tests):
for raw, errors, expected in tests:
if expected is not None:
try:
decoded = codecs.code_page_decode(cp, raw, errors)
except UnicodeDecodeError as err:
self.fail('Unable to decode %a from "cp%s" with '
'errors=%r: %s' % (raw, cp, errors, err))
self.assertEqual(decoded[0], expected,
'%a.decode("cp%s", %r)=%a != %a'
% (raw, cp, errors, decoded[0], expected))
# assert 0 <= decoded[1] <= len(raw)
self.assertGreaterEqual(decoded[1], 0)
self.assertLessEqual(decoded[1], len(raw))
else:
self.assertRaises(UnicodeDecodeError,
codecs.code_page_decode, cp, raw, errors)
def check_encode(self, cp, tests):
for text, errors, expected in tests:
if expected is not None:
try:
encoded = codecs.code_page_encode(cp, text, errors)
except UnicodeEncodeError as err:
self.fail('Unable to encode %a to "cp%s" with '
'errors=%r: %s' % (text, cp, errors, err))
self.assertEqual(encoded[0], expected,
'%a.encode("cp%s", %r)=%a != %a'
% (text, cp, errors, encoded[0], expected))
self.assertEqual(encoded[1], len(text))
else:
self.assertRaises(UnicodeEncodeError,
codecs.code_page_encode, cp, text, errors)
def test_cp932(self):
self.check_encode(932, (
('abc', 'strict', b'abc'),
('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
# not encodable
('\xff', 'strict', None),
('[\xff]', 'ignore', b'[]'),
('[\xff]', 'replace', b'[y]'),
('[\u20ac]', 'replace', b'[?]'),
))
tests = [
(b'abc', 'strict', 'abc'),
(b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
# invalid bytes
(b'\xff', 'strict', None),
(b'\xff', 'ignore', ''),
(b'\xff', 'replace', '\ufffd'),
(b'\x81\x00abc', 'strict', None),
(b'\x81\x00abc', 'ignore', '\x00abc'),
]
if self.vista_or_later:
tests.append((b'\x81\x00abc', 'replace', '\ufffd\x00abc'))
else:
tests.append((b'\x81\x00abc', 'replace', '\x00\x00abc'))
self.check_decode(932, tests)
def test_cp1252(self):
self.check_encode(1252, (
('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'\xe9\x80'),
('\xff', 'strict', b'\xff'),
('\u0141', 'strict', None),
('\u0141', 'ignore', b''),
('\u0141', 'replace', b'L'),
))
self.check_decode(1252, (
(b'abc', 'strict', 'abc'),
(b'\xe9\x80', 'strict', '\xe9\u20ac'),
(b'\xff', 'strict', '\xff'),
))
def test_cp_utf7(self):
cp = 65000
self.check_encode(cp, (
('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'+AOkgrA-'),
('\U0010ffff', 'strict', b'+2//f/w-'),
('\udc80', 'strict', b'+3IA-'),
('\ufffd', 'strict', b'+//0-'),
))
self.check_decode(cp, (
(b'abc', 'strict', 'abc'),
(b'+AOkgrA-', 'strict', '\xe9\u20ac'),
(b'+2//f/w-', 'strict', '\U0010ffff'),
(b'+3IA-', 'strict', '\udc80'),
(b'+//0-', 'strict', '\ufffd'),
# invalid bytes
(b'[+/]', 'strict', '[]'),
(b'[\xff]', 'strict', '[\xff]'),
))
def test_cp_utf8(self):
cp = self.CP_UTF8
tests = [
('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
]
if self.vista_or_later:
tests.append(('\udc80', 'strict', None))
tests.append(('\udc80', 'ignore', b''))
tests.append(('\udc80', 'replace', b'?'))
else:
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
self.check_encode(cp, tests)
tests = [
(b'abc', 'strict', 'abc'),
(b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
(b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
(b'\xef\xbf\xbd', 'strict', '\ufffd'),
(b'[\xc3\xa9]', 'strict', '[\xe9]'),
# invalid bytes
(b'[\xff]', 'strict', None),
(b'[\xff]', 'ignore', '[]'),
(b'[\xff]', 'replace', '[\ufffd]'),
]
if self.vista_or_later:
tests.extend((
(b'[\xed\xb2\x80]', 'strict', None),
(b'[\xed\xb2\x80]', 'ignore', '[]'),
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
))
else:
tests.extend((
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
))
self.check_decode(cp, tests)
def test_error_handlers(self):
self.check_encode(932, (
('\xff', 'backslashreplace', b'\\xff'),
('\xff', 'xmlcharrefreplace', b'&#255;'),
))
self.check_decode(932, (
(b'\xff', 'surrogateescape', '\udcff'),
))
if self.vista_or_later:
self.check_encode(self.CP_UTF8, (
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
))
def test_multibyte_encoding(self):
self.check_decode(932, (
(b'\x84\xe9\x80', 'ignore', '\u9a3e'),
(b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
))
self.check_decode(self.CP_UTF8, (
(b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
(b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
))
if self.vista_or_later:
self.check_encode(self.CP_UTF8, (
('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
))
def test_incremental(self):
decoded = codecs.code_page_decode(932,
b'\xe9\x80\xe9', 'strict',
False)
self.assertEqual(decoded, ('\u9a3e', 2))
decoded = codecs.code_page_decode(932,
b'\xe9\x80\xe9\x80', 'strict',
False)
self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))
decoded = codecs.code_page_decode(932,
b'abc', 'strict',
False)
self.assertEqual(decoded, ('abc', 3))
def test_main():
support.run_unittest(
UTF32Test,
......@@ -1772,6 +1969,7 @@ def test_main():
SurrogateEscapeTest,
BomTest,
TransformCodecTest,
CodePageTest,
)
......
......@@ -10,6 +10,10 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Issue #12281: Rewrite the MBCS codec to handle correctly replace and ignore
error handlers on all Windows versions. The MBCS codec is now supporting all
error handlers, instead of only replace to encode and ignore to decode.
- Issue #13188: When called without an explicit traceback argument,
generator.throw() now gets the traceback from the passed exception's
``__traceback__`` attribute. Patch by Petri Lehtinen.
......
......@@ -612,6 +612,31 @@ mbcs_decode(PyObject *self,
return codec_tuple(decoded, consumed);
}
static PyObject *
code_page_decode(PyObject *self,
PyObject *args)
{
Py_buffer pbuf;
const char *errors = NULL;
int final = 0;
Py_ssize_t consumed;
PyObject *decoded = NULL;
int code_page;
if (!PyArg_ParseTuple(args, "iy*|zi:code_page_decode",
&code_page, &pbuf, &errors, &final))
return NULL;
consumed = pbuf.len;
decoded = PyUnicode_DecodeCodePageStateful(code_page,
pbuf.buf, pbuf.len, errors,
final ? NULL : &consumed);
PyBuffer_Release(&pbuf);
if (decoded == NULL)
return NULL;
return codec_tuple(decoded, consumed);
}
#endif /* HAVE_MBCS */
/* --- Encoder ------------------------------------------------------------ */
......@@ -1011,6 +1036,29 @@ mbcs_encode(PyObject *self,
return v;
}
static PyObject *
code_page_encode(PyObject *self,
PyObject *args)
{
PyObject *str, *v;
const char *errors = NULL;
int code_page;
if (!PyArg_ParseTuple(args, "iO|z:code_page_encode",
&code_page, &str, &errors))
return NULL;
str = PyUnicode_FromObject(str);
if (str == NULL)
return NULL;
v = codec_tuple(PyUnicode_EncodeCodePage(code_page,
str,
errors),
PyUnicode_GET_LENGTH(str));
Py_DECREF(str);
return v;
}
#endif /* HAVE_MBCS */
/* --- Error handler registry --------------------------------------------- */
......@@ -1101,6 +1149,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef HAVE_MBCS
{"mbcs_encode", mbcs_encode, METH_VARARGS},
{"mbcs_decode", mbcs_decode, METH_VARARGS},
{"code_page_encode", code_page_encode, METH_VARARGS},
{"code_page_decode", code_page_decode, METH_VARARGS},
#endif
{"register_error", register_error, METH_VARARGS,
register_error__doc__},
......
This diff is collapsed.
......@@ -67,7 +67,7 @@ static void initsigs(void);
static void call_py_exitfuncs(void);
static void wait_for_thread_shutdown(void);
static void call_ll_exitfuncs(void);
extern void _PyUnicode_Init(void);
extern int _PyUnicode_Init(void);
extern void _PyUnicode_Fini(void);
extern int _PyLong_Init(void);
extern void PyLong_Fini(void);
......@@ -261,7 +261,8 @@ Py_InitializeEx(int install_sigs)
Py_FatalError("Py_Initialize: can't make modules_reloading dictionary");
/* Init Unicode implementation; relies on the codec registry */
_PyUnicode_Init();
if (_PyUnicode_Init() < 0)
Py_FatalError("Py_Initialize: can't initialize unicode");
bimod = _PyBuiltin_Init();
if (bimod == NULL)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment