Commit 7d00cc1a authored by Victor Stinner

Issue #20574: Implement incremental decoder for cp65001 codec

(Windows code page 65001, Microsoft UTF-8).
parent c4992674
...@@ -11,20 +11,23 @@ if not hasattr(codecs, 'code_page_encode'): ...@@ -11,20 +11,23 @@ if not hasattr(codecs, 'code_page_encode'):
### Codec APIs ### Codec APIs
encode = functools.partial(codecs.code_page_encode, 65001) encode = functools.partial(codecs.code_page_encode, 65001)
decode = functools.partial(codecs.code_page_decode, 65001) _decode = functools.partial(codecs.code_page_decode, 65001)
def decode(input, errors='strict'):
return codecs.code_page_decode(65001, input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder): class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False): def encode(self, input, final=False):
return encode(input, self.errors)[0] return encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder): class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
_buffer_decode = decode _buffer_decode = _decode
class StreamWriter(codecs.StreamWriter): class StreamWriter(codecs.StreamWriter):
encode = encode encode = encode
class StreamReader(codecs.StreamReader): class StreamReader(codecs.StreamReader):
decode = decode decode = _decode
### encodings module API ### encodings module API
......
...@@ -890,10 +890,6 @@ class CP65001Test(ReadTest, unittest.TestCase): ...@@ -890,10 +890,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
"\U00010fff\uD800") "\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass")) self.assertTrue(codecs.lookup_error("surrogatepass"))
def test_readline(self):
self.skipTest("issue #20571: code page 65001 codec does not "
"support partial decoder yet")
class UTF7Test(ReadTest, unittest.TestCase): class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7" encoding = "utf-7"
...@@ -2750,15 +2746,15 @@ class CodePageTest(unittest.TestCase): ...@@ -2750,15 +2746,15 @@ class CodePageTest(unittest.TestCase):
self.assertRaisesRegex(UnicodeEncodeError, 'cp932', self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
codecs.code_page_encode, 932, '\xff') codecs.code_page_encode, 932, '\xff')
self.assertRaisesRegex(UnicodeDecodeError, 'cp932', self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
codecs.code_page_decode, 932, b'\x81\x00') codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
codecs.code_page_decode, self.CP_UTF8, b'\xff') codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
def check_decode(self, cp, tests): def check_decode(self, cp, tests):
for raw, errors, expected in tests: for raw, errors, expected in tests:
if expected is not None: if expected is not None:
try: try:
decoded = codecs.code_page_decode(cp, raw, errors) decoded = codecs.code_page_decode(cp, raw, errors, True)
except UnicodeDecodeError as err: except UnicodeDecodeError as err:
self.fail('Unable to decode %a from "cp%s" with ' self.fail('Unable to decode %a from "cp%s" with '
'errors=%r: %s' % (raw, cp, errors, err)) 'errors=%r: %s' % (raw, cp, errors, err))
...@@ -2770,7 +2766,7 @@ class CodePageTest(unittest.TestCase): ...@@ -2770,7 +2766,7 @@ class CodePageTest(unittest.TestCase):
self.assertLessEqual(decoded[1], len(raw)) self.assertLessEqual(decoded[1], len(raw))
else: else:
self.assertRaises(UnicodeDecodeError, self.assertRaises(UnicodeDecodeError,
codecs.code_page_decode, cp, raw, errors) codecs.code_page_decode, cp, raw, errors, True)
def check_encode(self, cp, tests): def check_encode(self, cp, tests):
for text, errors, expected in tests: for text, errors, expected in tests:
......
...@@ -13,6 +13,9 @@ Core and Builtins ...@@ -13,6 +13,9 @@ Core and Builtins
Library Library
------- -------
- Issue #20574: Implement incremental decoder for cp65001 codec (Windows code
page 65001, Microsoft UTF-8).
- Issue #20879: Delay the initialization of encoding and decoding tables for - Issue #20879: Delay the initialization of encoding and decoding tables for
base32, ascii85 and base85 codecs in the base64 module, and delay the base32, ascii85 and base85 codecs in the base64 module, and delay the
initialization of the unquote_to_bytes() table of the urllib.parse module, to initialization of the unquote_to_bytes() table of the urllib.parse module, to
......
...@@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj) ...@@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj)
return PyBytes_AS_STRING(*obj); return PyBytes_AS_STRING(*obj);
} }
static int
is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
{
const char *curr = s + offset;
const char *prev;
if (!IsDBCSLeadByteEx(code_page, *curr))
return 0;
prev = CharPrevExA(code_page, s, curr, 0);
if (prev == curr)
return 1;
/* FIXME: This code is limited to "true" double-byte encodings,
as it assumes an incomplete character consists of a single
byte. */
if (curr - prev == 2)
return 1;
if (!IsDBCSLeadByteEx(code_page, *prev))
return 1;
return 0;
}
static DWORD static DWORD
decode_code_page_flags(UINT code_page) decode_code_page_flags(UINT code_page)
{ {
...@@ -6913,7 +6891,7 @@ static int ...@@ -6913,7 +6891,7 @@ static int
decode_code_page_errors(UINT code_page, decode_code_page_errors(UINT code_page,
PyObject **v, PyObject **v,
const char *in, const int size, const char *in, const int size,
const char *errors) const char *errors, int final)
{ {
const char *startin = in; const char *startin = in;
const char *endin = in + size; const char *endin = in + size;
...@@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page, ...@@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page,
if (encoding == NULL) if (encoding == NULL)
return -1; return -1;
if (errors == NULL || strcmp(errors, "strict") == 0) { if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
UnicodeDecodeError. */ UnicodeDecodeError. */
make_decode_exception(&exc, encoding, in, size, 0, 0, reason); make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
...@@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page, ...@@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page,
if (outsize <= 0) { if (outsize <= 0) {
Py_ssize_t startinpos, endinpos, outpos; Py_ssize_t startinpos, endinpos, outpos;
/* last character in partial decode? */
if (in + insize >= endin && !final)
break;
startinpos = in - startin; startinpos = in - startin;
endinpos = startinpos + 1; endinpos = startinpos + 1;
outpos = out - PyUnicode_AS_UNICODE(*v); outpos = out - PyUnicode_AS_UNICODE(*v);
...@@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page, ...@@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page,
assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0) if (unicode_resize(v, outsize) < 0)
goto error; goto error;
ret = size; ret = in - startin;
error: error:
Py_XDECREF(encoding_obj); Py_XDECREF(encoding_obj);
...@@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page, ...@@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page,
done = 1; done = 1;
} }
/* Skip trailing lead-byte unless 'final' is set */
if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
--chunk_size;
if (chunk_size == 0 && done) { if (chunk_size == 0 && done) {
if (v != NULL) if (v != NULL)
break; break;
_Py_RETURN_UNICODE_EMPTY(); _Py_RETURN_UNICODE_EMPTY();
} }
converted = decode_code_page_strict(code_page, &v, converted = decode_code_page_strict(code_page, &v,
s, chunk_size); s, chunk_size);
if (converted == -2) if (converted == -2)
converted = decode_code_page_errors(code_page, &v, converted = decode_code_page_errors(code_page, &v,
s, chunk_size, s, chunk_size,
errors); errors, final);
assert(converted != 0); assert(converted != 0 || done);
if (converted < 0) { if (converted < 0) {
Py_XDECREF(v); Py_XDECREF(v);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment