Commit 2cded9c3 authored by Victor Stinner

Issue #12016: Multibyte CJK decoders now resynchronize faster

They only ignore the first byte of an invalid byte sequence.

For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of
'\ufffd'.
parent 081fe46f
...@@ -68,6 +68,29 @@ New, Improved, and Deprecated Modules ...@@ -68,6 +68,29 @@ New, Improved, and Deprecated Modules
* Stub * Stub
codecs
------
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312',
'replace') gives '\ufffd\n' instead of '\ufffd'.
(http://bugs.python.org/issue12016)
Don't reset incremental encoders of CJK codecs at each call to their encode()
method anymore. For example: ::
$ ./python -q
>>> import codecs
>>> encoder = codecs.getincrementalencoder('hz')('strict')
>>> b''.join(encoder.encode(x) for x in '\u52ff\u65bd\u65bc\u4eba\u3002 Bye.')
b'~{NpJ)l6HK!#~} Bye.'
This example gives b'~{Np~}~{J)~}~{l6~}~{HK~}~{!#~} Bye.' with older Python
versions.
(http://bugs.python.org/issue12100)
faulthandler faulthandler
------------ ------------
......
...@@ -15,8 +15,8 @@ class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -15,8 +15,8 @@ class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x81\x81\xc1\xc4", "strict", None), (b"abc\x81\x81\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\u804a"), (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
(b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
(b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"), (b"abc\x81\x81\xc1\xc4", "ignore", "abc\u804a"),
(b"\xc1\x64", "strict", None), (b"\xc1\x64", "strict", None),
) )
...@@ -28,8 +28,8 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -28,8 +28,8 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),
(b"\x83\x34\x83\x31", "strict", None), (b"\x83\x34\x83\x31", "strict", None),
("\u30fb", "strict", None), ("\u30fb", "strict", None),
...@@ -42,11 +42,14 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -42,11 +42,14 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u804a"),
(b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd\u804a"), (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
("\u30fb", "strict", b"\x819\xa79"), ("\u30fb", "strict", b"\x819\xa79"),
(b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),
(b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),
(b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),
) )
has_iso10646 = True has_iso10646 = True
...@@ -74,9 +77,11 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -74,9 +77,11 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):
'\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002' '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
'Bye.\n'), 'Bye.\n'),
# invalid bytes # invalid bytes
(b'ab~cd', 'replace', 'ab\uFFFDd'), (b'ab~cd', 'replace', 'ab\uFFFDcd'),
(b'ab\xffcd', 'replace', 'ab\uFFFDcd'), (b'ab\xffcd', 'replace', 'ab\uFFFDcd'),
(b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'), (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),
(b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),
(b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
) )
def test_main(): def test_main():
......
...@@ -15,8 +15,8 @@ class Test_Big5HKSCS(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -15,8 +15,8 @@ class Test_Big5HKSCS(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"),
) )
......
...@@ -15,50 +15,57 @@ class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -15,50 +15,57 @@ class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x81\x00\x81\x00\x82\x84", "strict", None), (b"abc\x81\x00\x81\x00\x82\x84", "strict", None),
(b"abc\xf8", "strict", None), (b"abc\xf8", "strict", None),
(b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\uff44"), (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\x00\uff44"),
(b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"), (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\x00\uff44\ufffd"),
(b"abc\x81\x00\x82\x84", "ignore", "abc\uff44"), (b"abc\x81\x00\x82\x84", "ignore", "abc\x00\uff44"),
(b"ab\xEBxy", "replace", "ab\uFFFDxy"),
(b"ab\xF0\x39xy", "replace", "ab\uFFFD9xy"),
(b"ab\xEA\xF0xy", "replace", 'ab\ufffd\ue038y'),
# sjis vs cp932 # sjis vs cp932
(b"\\\x7e", "replace", "\\\x7e"), (b"\\\x7e", "replace", "\\\x7e"),
(b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"), (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"),
) )
# Decoder test vectors shared by the EUC-family test classes.
# Every entry is a 3-tuple: (input, error handler name, expected result).
# NOTE(review): an expected result of None only ever appears with "strict"
# on malformed input -- presumably "must raise"; confirm against TestBase.
euc_commontests = (
    # invalid bytes
    (b"abc\x80\x80\xc1\xc4", "strict", None),
    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u7956"),
    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u7956\ufffd"),
    (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"),
    (b"abc\xc8", "strict", None),
    # each byte of a bad sequence yields its own U+FFFD under "replace"
    (b"abc\x8f\x83\x83", "replace", "abc\ufffd\ufffd\ufffd"),
    (b"\x82\xFCxy", "replace", "\ufffd\ufffdxy"),
    (b"\xc1\x64", "strict", None),
    # well-formed two-byte sequence, alone and followed by an ASCII backslash
    (b"\xa1\xc0", "strict", "\uff3c"),
    (b"\xa1\xc0\\", "strict", "\uff3c\\"),
    # 0x8e followed by ASCII: only the first byte is replaced
    (b"\x8eXY", "replace", "\ufffdXY"),
)
class Test_EUC_JIS_2004(test_multibytecodec_support.TestBase,
                        unittest.TestCase):
    """Codec tests for ``euc_jis_2004`` driven by the shared EUC vectors.

    The sample text is loaded from the ``euc_jisx0213`` test string and the
    decoder cases come from ``euc_commontests``.
    """
    encoding = 'euc_jis_2004'
    tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
    codectests = euc_commontests
    # (text, expected bytes). A bytes literal may only contain ASCII, so the
    # characters U+211C, U+2329, U+1234 and U+232A from the text must be
    # spelled as entity / numeric character references in the expected bytes.
    # NOTE(review): presumably consumed by an XML-char-name error handler in
    # TestBase -- confirm there.
    xmlcharnametest = (
        "\xab\u211c\xbb = \u2329\u1234\u232a",
        b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
    )
class Test_EUC_JISX0213(test_multibytecodec_support.TestBase, class Test_EUC_JISX0213(test_multibytecodec_support.TestBase,
unittest.TestCase): unittest.TestCase):
encoding = 'euc_jisx0213' encoding = 'euc_jisx0213'
tstring = test_multibytecodec_support.load_teststring('euc_jisx0213') tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
codectests = ( codectests = euc_commontests
# invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"),
(b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
(b"\xc1\x64", "strict", None),
(b"\xa1\xc0", "strict", "\uff3c"),
)
xmlcharnametest = ( xmlcharnametest = (
"\xab\u211c\xbb = \u2329\u1234\u232a", "\xab\u211c\xbb = \u2329\u1234\u232a",
b"\xa9\xa8ℜ\xa9\xb2 = ⟨ሴ⟩" b"\xa9\xa8ℜ\xa9\xb2 = ⟨ሴ⟩"
) )
eucjp_commontests = (
(b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u7956"),
(b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
(b"\xc1\x64", "strict", None),
)
class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase, class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
unittest.TestCase): unittest.TestCase):
encoding = 'euc_jp' encoding = 'euc_jp'
tstring = test_multibytecodec_support.load_teststring('euc_jp') tstring = test_multibytecodec_support.load_teststring('euc_jp')
codectests = eucjp_commontests + ( codectests = euc_commontests + (
(b"\xa1\xc0\\", "strict", "\uff3c\\"),
("\xa5", "strict", b"\x5c"), ("\xa5", "strict", b"\x5c"),
("\u203e", "strict", b"\x7e"), ("\u203e", "strict", b"\x7e"),
) )
...@@ -66,8 +73,6 @@ class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase, ...@@ -66,8 +73,6 @@ class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
shiftjis_commonenctests = ( shiftjis_commonenctests = (
(b"abc\x80\x80\x82\x84", "strict", None), (b"abc\x80\x80\x82\x84", "strict", None),
(b"abc\xf8", "strict", None), (b"abc\xf8", "strict", None),
(b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
(b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
(b"abc\x80\x80\x82\x84def", "ignore", "abc\uff44def"), (b"abc\x80\x80\x82\x84def", "ignore", "abc\uff44def"),
) )
...@@ -75,20 +80,41 @@ class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -75,20 +80,41 @@ class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase):
encoding = 'shift_jis' encoding = 'shift_jis'
tstring = test_multibytecodec_support.load_teststring('shift_jis') tstring = test_multibytecodec_support.load_teststring('shift_jis')
codectests = shiftjis_commonenctests + ( codectests = shiftjis_commonenctests + (
(b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
(b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
(b"\\\x7e", "strict", "\\\x7e"), (b"\\\x7e", "strict", "\\\x7e"),
(b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"), (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"),
(b"abc\x81\x39", "replace", "abc\ufffd9"),
(b"abc\xEA\xFC", "replace", "abc\ufffd\ufffd"),
(b"abc\xFF\x58", "replace", "abc\ufffdX"),
)
class Test_SJIS_2004(test_multibytecodec_support.TestBase, unittest.TestCase):
    """Codec tests for ``shift_jis_2004``.

    Extends the shared ``shiftjis_commonenctests`` vectors with cases
    specific to this codec; the sample text is the ``shift_jis`` test string.
    """
    encoding = 'shift_jis_2004'
    tstring = test_multibytecodec_support.load_teststring('shift_jis')
    codectests = shiftjis_commonenctests + (
        (b"\\\x7e", "strict", "\xa5\u203e"),
        (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\\\u2016\u2212"),
        (b"abc\xEA\xFC", "strict", "abc\u64bf"),
        # an invalid lead byte is replaced on its own; the next byte is kept
        (b"\x81\x39xy", "replace", "\ufffd9xy"),
        (b"\xFF\x58xy", "replace", "\ufffdXxy"),
        (b"\x80\x80\x82\x84xy", "replace", "\ufffd\ufffd\uff44xy"),
        # here 0x88 pairs with the following 'x' and decodes to U+5864
        (b"\x80\x80\x82\x84\x88xy", "replace", "\ufffd\ufffd\uff44\u5864y"),
        (b"\xFC\xFBxy", "replace", '\ufffd\u95b4y'),
    )
    # (text, expected bytes). A bytes literal may only contain ASCII, so the
    # characters U+211C, U+2329, U+1234 and U+232A from the text must be
    # spelled as entity / numeric character references in the expected bytes.
    # NOTE(review): presumably consumed by an XML-char-name error handler in
    # TestBase -- confirm there.
    xmlcharnametest = (
        "\xab\u211c\xbb = \u2329\u1234\u232a",
        b"\x85G&real;\x85Q = &lang;&#4660;&rang;"
    )
class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase): class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase):
encoding = 'shift_jisx0213' encoding = 'shift_jisx0213'
tstring = test_multibytecodec_support.load_teststring('shift_jisx0213') tstring = test_multibytecodec_support.load_teststring('shift_jisx0213')
codectests = ( codectests = shiftjis_commonenctests + (
# invalid bytes (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
(b"abc\x80\x80\x82\x84", "strict", None), (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
(b"abc\xf8", "strict", None),
(b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
(b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
(b"abc\x80\x80\x82\x84def", "ignore", "abc\uff44def"),
# sjis vs cp932 # sjis vs cp932
(b"\\\x7e", "replace", "\xa5\u203e"), (b"\\\x7e", "replace", "\xa5\u203e"),
(b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"), (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"),
......
...@@ -15,8 +15,8 @@ class Test_CP949(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -15,8 +15,8 @@ class Test_CP949(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\uc894"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"),
) )
...@@ -27,8 +27,8 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -27,8 +27,8 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"), (b"abc\x80\x80\xc1\xc4", "replace", 'abc\ufffd\ufffd\uc894'),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\uc894"),
# composed make-up sequence errors # composed make-up sequence errors
...@@ -40,13 +40,14 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -40,13 +40,14 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
(b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
(b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"),
(b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"),
(b"a\xa4\xd4\xa4\xb6\xa4", "replace", "a\ufffd"), (b"a\xa4\xd4\xa4\xb6\xa4", "replace", 'a\ufffd'),
(b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None), (b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
(b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
(b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
(b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", "\ufffd"), (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", '\ufffd\u6e21\ufffd\u3160\ufffd'),
(b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", "\ufffd"), (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", '\ufffd\u6e21\ub544\ufffd\ufffd'),
(b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", "\ufffd"), (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", '\ufffd\u6e21\ub544\u572d\ufffd'),
(b"\xa4\xd4\xff\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "replace", '\ufffd\ufffd\ufffd\uc4d4'),
(b"\xc1\xc4", "strict", "\uc894"), (b"\xc1\xc4", "strict", "\uc894"),
) )
...@@ -57,9 +58,13 @@ class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -57,9 +58,13 @@ class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ucd27"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\ucd27"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ucd27\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\ucd27\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\ucd27"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\ucd27"),
(b"\xD8abc", "replace", "\uFFFDabc"),
(b"\xD8\xFFabc", "replace", "\uFFFD\uFFFDabc"),
(b"\x84bxy", "replace", "\uFFFDbxy"),
(b"\x8CBxy", "replace", "\uFFFDBxy"),
) )
def test_main(): def test_main():
......
...@@ -15,8 +15,8 @@ class Test_Big5(test_multibytecodec_support.TestBase, unittest.TestCase): ...@@ -15,8 +15,8 @@ class Test_Big5(test_multibytecodec_support.TestBase, unittest.TestCase):
# invalid bytes # invalid bytes
(b"abc\x80\x80\xc1\xc4", "strict", None), (b"abc\x80\x80\xc1\xc4", "strict", None),
(b"abc\xc8", "strict", None), (b"abc\xc8", "strict", None),
(b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"), (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
(b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"), (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
(b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"), (b"abc\x80\x80\xc1\xc4", "ignore", "abc\u8b10"),
) )
......
...@@ -23,6 +23,9 @@ class TestCP950Map(test_multibytecodec_support.TestBase_Mapping, ...@@ -23,6 +23,9 @@ class TestCP950Map(test_multibytecodec_support.TestBase_Mapping,
(b'\xa2\xcc', '\u5341'), (b'\xa2\xcc', '\u5341'),
(b'\xa2\xce', '\u5345'), (b'\xa2\xce', '\u5345'),
] ]
codectests = (
(b"\xFFxy", "replace", "\ufffdxy"),
)
def test_main(): def test_main():
support.run_unittest(__name__) support.run_unittest(__name__)
......
...@@ -219,6 +219,10 @@ Core and Builtins ...@@ -219,6 +219,10 @@ Core and Builtins
Library Library
------- -------
- Issue #12016: Multibyte CJK decoders now resynchronize faster. They only
ignore the first byte of an invalid byte sequence. For example,
b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'.
- Issue #12459: time.sleep() now raises a ValueError if the sleep length is - Issue #12459: time.sleep() now raises a ValueError if the sleep length is
negative, instead of an infinite sleep on Windows or raising an IOError on negative, instead of an infinite sleep on Windows or raising an IOError on
Linux for example, to have the same behaviour on all platforms. Linux for example, to have the same behaviour on all platforms.
......
...@@ -85,7 +85,7 @@ DECODER(gb2312) ...@@ -85,7 +85,7 @@ DECODER(gb2312)
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1) NEXT(2, 1)
} }
else return 2; else return 1;
} }
return 0; return 0;
...@@ -141,7 +141,7 @@ DECODER(gbk) ...@@ -141,7 +141,7 @@ DECODER(gbk)
REQUIRE_INBUF(2) REQUIRE_INBUF(2)
GBK_DECODE(c, IN2, **outbuf) GBK_DECODE(c, IN2, **outbuf)
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
...@@ -267,7 +267,7 @@ DECODER(gb18030) ...@@ -267,7 +267,7 @@ DECODER(gb18030)
c3 = IN3; c3 = IN3;
c4 = IN4; c4 = IN4;
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
return 4; return 1;
c -= 0x81; c2 -= 0x30; c -= 0x81; c2 -= 0x30;
c3 -= 0x81; c4 -= 0x30; c3 -= 0x81; c4 -= 0x30;
...@@ -292,12 +292,12 @@ DECODER(gb18030) ...@@ -292,12 +292,12 @@ DECODER(gb18030)
continue; continue;
} }
} }
return 4; return 1;
} }
GBK_DECODE(c, c2, **outbuf) GBK_DECODE(c, c2, **outbuf)
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
...@@ -400,7 +400,7 @@ DECODER(hz) ...@@ -400,7 +400,7 @@ DECODER(hz)
else if (c2 == '\n') else if (c2 == '\n')
; /* line-continuation */ ; /* line-continuation */
else else
return 2; return 1;
NEXT(2, 0); NEXT(2, 0);
continue; continue;
} }
...@@ -419,7 +419,7 @@ DECODER(hz) ...@@ -419,7 +419,7 @@ DECODER(hz)
NEXT(2, 1) NEXT(2, 1)
} }
else else
return 2; return 1;
} }
} }
......
...@@ -161,7 +161,7 @@ DECODER(big5hkscs) ...@@ -161,7 +161,7 @@ DECODER(big5hkscs)
case 0x8864: WRITE2(0x00ca, 0x030c); break; case 0x8864: WRITE2(0x00ca, 0x030c); break;
case 0x88a3: WRITE2(0x00ea, 0x0304); break; case 0x88a3: WRITE2(0x00ea, 0x0304); break;
case 0x88a5: WRITE2(0x00ea, 0x030c); break; case 0x88a5: WRITE2(0x00ea, 0x030c); break;
default: return 2; default: return 1;
} }
NEXT(2, 2) /* all decoded codepoints are pairs, above. */ NEXT(2, 2) /* all decoded codepoints are pairs, above. */
......
...@@ -112,7 +112,7 @@ DECODER(cp932) ...@@ -112,7 +112,7 @@ DECODER(cp932)
TRYMAP_DEC(cp932ext, **outbuf, c, c2); TRYMAP_DEC(cp932ext, **outbuf, c, c2);
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2; return 1;
c = (c < 0xe0 ? c - 0x81 : c - 0xc1); c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
...@@ -120,7 +120,7 @@ DECODER(cp932) ...@@ -120,7 +120,7 @@ DECODER(cp932)
c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
TRYMAP_DEC(jisx0208, **outbuf, c, c2); TRYMAP_DEC(jisx0208, **outbuf, c, c2);
else return 2; else return 1;
} }
else if (c >= 0xf0 && c <= 0xf9) { else if (c >= 0xf0 && c <= 0xf9) {
if ((c2 >= 0x40 && c2 <= 0x7e) || if ((c2 >= 0x40 && c2 <= 0x7e) ||
...@@ -128,10 +128,10 @@ DECODER(cp932) ...@@ -128,10 +128,10 @@ DECODER(cp932)
OUT1(0xe000 + 188 * (c - 0xf0) + OUT1(0xe000 + 188 * (c - 0xf0) +
(c2 < 0x80 ? c2 - 0x40 : c2 - 0x41)) (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
else else
return 2; return 1;
} }
else else
return 2; return 1;
NEXT(2, 1) NEXT(2, 1)
} }
...@@ -256,7 +256,7 @@ DECODER(euc_jis_2004) ...@@ -256,7 +256,7 @@ DECODER(euc_jis_2004)
NEXT(2, 1) NEXT(2, 1)
} }
else else
return 2; return 1;
} }
else if (c == 0x8f) { else if (c == 0x8f) {
unsigned char c2, c3; unsigned char c2, c3;
...@@ -274,7 +274,7 @@ DECODER(euc_jis_2004) ...@@ -274,7 +274,7 @@ DECODER(euc_jis_2004)
continue; continue;
} }
else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ; else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
else return 3; else return 1;
NEXT(3, 1) NEXT(3, 1)
} }
else { else {
...@@ -300,7 +300,7 @@ DECODER(euc_jis_2004) ...@@ -300,7 +300,7 @@ DECODER(euc_jis_2004)
NEXT(2, 2) NEXT(2, 2)
continue; continue;
} }
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
} }
...@@ -388,7 +388,7 @@ DECODER(euc_jp) ...@@ -388,7 +388,7 @@ DECODER(euc_jp)
NEXT(2, 1) NEXT(2, 1)
} }
else else
return 2; return 1;
} }
else if (c == 0x8f) { else if (c == 0x8f) {
unsigned char c2, c3; unsigned char c2, c3;
...@@ -401,7 +401,7 @@ DECODER(euc_jp) ...@@ -401,7 +401,7 @@ DECODER(euc_jp)
NEXT(3, 1) NEXT(3, 1)
} }
else else
return 3; return 1;
} }
else { else {
unsigned char c2; unsigned char c2;
...@@ -417,7 +417,7 @@ DECODER(euc_jp) ...@@ -417,7 +417,7 @@ DECODER(euc_jp)
#endif #endif
TRYMAP_DEC(jisx0208, **outbuf, TRYMAP_DEC(jisx0208, **outbuf,
c ^ 0x80, c2 ^ 0x80) ; c ^ 0x80, c2 ^ 0x80) ;
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
} }
...@@ -502,7 +502,7 @@ DECODER(shift_jis) ...@@ -502,7 +502,7 @@ DECODER(shift_jis)
REQUIRE_INBUF(2) REQUIRE_INBUF(2)
c2 = IN2; c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2; return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
...@@ -522,10 +522,10 @@ DECODER(shift_jis) ...@@ -522,10 +522,10 @@ DECODER(shift_jis)
continue; continue;
} }
else else
return 2; return 1;
} }
else else
return 2; return 1;
NEXT(1, 1) /* JIS X 0201 */ NEXT(1, 1) /* JIS X 0201 */
} }
...@@ -645,7 +645,7 @@ DECODER(shift_jis_2004) ...@@ -645,7 +645,7 @@ DECODER(shift_jis_2004)
REQUIRE_INBUF(2) REQUIRE_INBUF(2)
c2 = IN2; c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
return 2; return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
...@@ -671,7 +671,7 @@ DECODER(shift_jis_2004) ...@@ -671,7 +671,7 @@ DECODER(shift_jis_2004)
NEXT_OUT(2) NEXT_OUT(2)
} }
else else
return 2; return 1;
NEXT_IN(2) NEXT_IN(2)
} }
else { /* Plane 2 */ else { /* Plane 2 */
...@@ -689,13 +689,13 @@ DECODER(shift_jis_2004) ...@@ -689,13 +689,13 @@ DECODER(shift_jis_2004)
continue; continue;
} }
else else
return 2; return 1;
NEXT(2, 1) NEXT(2, 1)
} }
continue; continue;
} }
else else
return 2; return 1;
NEXT(1, 1) /* JIS X 0201 */ NEXT(1, 1) /* JIS X 0201 */
} }
......
...@@ -123,7 +123,7 @@ DECODER(euc_kr) ...@@ -123,7 +123,7 @@ DECODER(euc_kr)
if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE || if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE || (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE) (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
return 8; return 1;
c = (*inbuf)[3]; c = (*inbuf)[3];
if (0xa1 <= c && c <= 0xbe) if (0xa1 <= c && c <= 0xbe)
...@@ -143,7 +143,7 @@ DECODER(euc_kr) ...@@ -143,7 +143,7 @@ DECODER(euc_kr)
jong = NONE; jong = NONE;
if (cho == NONE || jung == NONE || jong == NONE) if (cho == NONE || jung == NONE || jong == NONE)
return 8; return 1;
OUT1(0xac00 + cho*588 + jung*28 + jong); OUT1(0xac00 + cho*588 + jung*28 + jong);
NEXT(8, 1) NEXT(8, 1)
...@@ -152,7 +152,7 @@ DECODER(euc_kr) ...@@ -152,7 +152,7 @@ DECODER(euc_kr)
NEXT(2, 1) NEXT(2, 1)
} }
else else
return 2; return 1;
} }
return 0; return 0;
...@@ -208,7 +208,7 @@ DECODER(cp949) ...@@ -208,7 +208,7 @@ DECODER(cp949)
REQUIRE_INBUF(2) REQUIRE_INBUF(2)
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80); TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(cp949ext, **outbuf, c, IN2); else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
...@@ -375,7 +375,7 @@ DECODER(johab) ...@@ -375,7 +375,7 @@ DECODER(johab)
i_jong = johabidx_jongseong[c_jong]; i_jong = johabidx_jongseong[c_jong];
if (i_cho == NONE || i_jung == NONE || i_jong == NONE) if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
return 2; return 1;
/* we don't use U+1100 hangul jamo yet. */ /* we don't use U+1100 hangul jamo yet. */
if (i_cho == FILL) { if (i_cho == FILL) {
...@@ -391,7 +391,7 @@ DECODER(johab) ...@@ -391,7 +391,7 @@ DECODER(johab)
OUT1(0x3100 | OUT1(0x3100 |
johabjamo_jungseong[c_jung]) johabjamo_jungseong[c_jung])
else else
return 2; return 1;
} }
} else { } else {
if (i_jung == FILL) { if (i_jung == FILL) {
...@@ -399,7 +399,7 @@ DECODER(johab) ...@@ -399,7 +399,7 @@ DECODER(johab)
OUT1(0x3100 | OUT1(0x3100 |
johabjamo_choseong[c_cho]) johabjamo_choseong[c_cho])
else else
return 2; return 1;
} }
else else
OUT1(0xac00 + OUT1(0xac00 +
...@@ -414,7 +414,7 @@ DECODER(johab) ...@@ -414,7 +414,7 @@ DECODER(johab)
c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) || c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
(c2 & 0x7f) == 0x7f || (c2 & 0x7f) == 0x7f ||
(c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3))) (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
return 2; return 1;
else { else {
unsigned char t1, t2; unsigned char t1, t2;
...@@ -425,7 +425,7 @@ DECODER(johab) ...@@ -425,7 +425,7 @@ DECODER(johab)
t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21; t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
TRYMAP_DEC(ksx1001, **outbuf, t1, t2); TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
} }
......
...@@ -55,7 +55,7 @@ DECODER(big5) ...@@ -55,7 +55,7 @@ DECODER(big5)
TRYMAP_DEC(big5, **outbuf, c, IN2) { TRYMAP_DEC(big5, **outbuf, c, IN2) {
NEXT(2, 1) NEXT(2, 1)
} }
else return 2; else return 1;
} }
return 0; return 0;
...@@ -109,7 +109,7 @@ DECODER(cp950) ...@@ -109,7 +109,7 @@ DECODER(cp950)
TRYMAP_DEC(cp950ext, **outbuf, c, IN2); TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
else TRYMAP_DEC(big5, **outbuf, c, IN2); else TRYMAP_DEC(big5, **outbuf, c, IN2);
else return 2; else return 1;
NEXT(2, 1) NEXT(2, 1)
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment