Commit d267ac20 authored by Victor Stinner, committed by GitHub

bpo-36778: cp65001 encoding becomes an alias to utf_8 (GH-13230)

parent 137be341
......@@ -1106,8 +1106,7 @@ particular, the following variants typically exist:
+-----------------+--------------------------------+--------------------------------+
| cp1258 | windows-1258 | Vietnamese |
+-----------------+--------------------------------+--------------------------------+
| cp65001 | | Windows only: Windows UTF-8 |
| | | (``CP_UTF8``) |
| cp65001 | | Alias to ``utf_8`` encoding |
| | | |
| | | .. versionadded:: 3.3 |
+-----------------+--------------------------------+--------------------------------+
......
......@@ -534,6 +534,7 @@ aliases = {
'utf8' : 'utf_8',
'utf8_ucs2' : 'utf_8',
'utf8_ucs4' : 'utf_8',
'cp65001' : 'utf_8',
# uu_codec codec
'uu' : 'uu_codec',
......
"""
Code page 65001: Windows UTF-8 (CP_UTF8).
"""
import codecs
import functools
# Refuse to load when the interpreter does not provide the code-page codec
# functions; per the message below they are only available on Windows.
if not hasattr(codecs, 'code_page_encode'):
    raise LookupError("cp65001 encoding is only available on Windows")

### Codec APIs

# Pre-bind code page 65001 as the first positional argument so callers only
# pass the remaining arguments (input, errors, ...).
encode = functools.partial(codecs.code_page_encode, 65001)
_decode = functools.partial(codecs.code_page_decode, 65001)
def decode(input, errors='strict'):
    """Decode *input* from code page 65001 in a single call.

    Always passes ``True`` as the fourth argument of
    ``codecs.code_page_decode`` (presumably the *final* flag, as in the
    other ``codecs`` decode functions) and returns that call's result
    unchanged.
    """
    return codecs.code_page_decode(65001, input, errors, True)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that delegates to the module-level ``encode``."""

    def encode(self, input, final=False):
        # ``final`` is accepted but not used; only the first item of the
        # tuple returned by ``encode`` is handed back to the caller.
        return encode(input, self.errors)[0]
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder built on ``codecs.BufferedIncrementalDecoder``."""

    # Reuse the module-level ``_decode`` partial (code page 65001 pre-bound)
    # as the buffer-decoding hook of the base class.
    _buffer_decode = _decode
class StreamWriter(codecs.StreamWriter):
    """Stream writer whose ``encode`` is the module-level ``encode`` partial."""

    # Right-hand side resolves to the module-level ``encode`` defined earlier.
    encode = encode
class StreamReader(codecs.StreamReader):
    """Stream reader whose ``decode`` is the module-level ``_decode`` partial."""

    # Uses ``_decode`` (not the one-shot ``decode`` function of this module).
    decode = _decode
### encodings module API
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing the ``cp65001`` codec.

    The entry bundles this module's one-shot ``encode``/``decode`` callables
    with its incremental and stream classes.
    """
    return codecs.CodecInfo(
        name='cp65001',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
......@@ -875,95 +875,6 @@ class UTF8Test(ReadTest, unittest.TestCase):
b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    # Exercises the "cp65001" codec; the whole class is skipped unless
    # sys.platform is 'win32'.
    encoding = "cp65001"

    def test_encode(self):
        # Each entry is (text, error handler, expected bytes); an expected
        # value of None means the encode call must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                                 '%a.encode("cp65001", %r)=%a != %a'
                                 % (text, errors, encoded, expected))
            else:
                self.assertRaises(UnicodeEncodeError,
                                  text.encode, "cp65001", errors)

    def test_decode(self):
        # Each entry is (raw bytes, error handler, expected text); an expected
        # value of None means the decode call must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                                 '%a.decode("cp65001", %r)=%a != %a'
                                 % (raw, errors, decoded, expected))
            else:
                self.assertRaises(UnicodeDecodeError,
                                  raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # Lone surrogates are rejected with the default (strict) handler ...
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        # ... and handled according to each explicit error handler.
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        # xmlcharrefreplace emits the decimal code point: 0xDC80 == 56448.
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # "surrogatepass" must round-trip lone surrogates in both directions.
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"
......
``cp65001`` encoding (Windows code page 65001) becomes an alias to ``utf_8``
encoding.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment