Commit c3713e97 authored by Victor Stinner's avatar Victor Stinner

Optimize ascii/latin1+surrogateescape encoders

Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast.

Initial patch written by Serhiy Storchaka.
parent 5fbeabcb
...@@ -117,6 +117,9 @@ Optimizations ...@@ -117,6 +117,9 @@ Optimizations
* The ASCII decoder is now up to 60 times as fast for error handlers: * The ASCII decoder is now up to 60 times as fast for error handlers:
``surrogateescape``, ``ignore`` and ``replace``. ``surrogateescape``, ``ignore`` and ``replace``.
* The ASCII and the Latin1 encoders are now up to 3 times as fast for the
error handler ``surrogateescape``.
Build and C API Changes Build and C API Changes
======================= =======================
......
...@@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase): ...@@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
class ASCIITest(unittest.TestCase): class ASCIITest(unittest.TestCase):
def test_encode(self):
self.assertEqual('abc123'.encode('ascii'), b'abc123')
def test_encode_error(self):
for data, error_handler, expected in (
('[\x80\xff\u20ac]', 'ignore', b'[]'),
('[\x80\xff\u20ac]', 'replace', b'[???]'),
('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'),
('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
):
with self.subTest(data=data, error_handler=error_handler,
expected=expected):
self.assertEqual(data.encode('ascii', error_handler),
expected)
def test_encode_surrogateescape_error(self):
with self.assertRaises(UnicodeEncodeError):
# the first character can be decoded, but not the second
'\udc80\xff'.encode('ascii', 'surrogateescape')
def test_decode(self): def test_decode(self):
self.assertEqual(b'abc'.decode('ascii'), 'abc')
def test_decode_error(self):
for data, error_handler, expected in ( for data, error_handler, expected in (
(b'[\x80\xff]', 'ignore', '[]'), (b'[\x80\xff]', 'ignore', '[]'),
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
...@@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase): ...@@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
expected) expected)
class Latin1Test(unittest.TestCase):
    """Tests for the Latin-1 codec: plain conversions and error handlers."""

    def test_encode(self):
        """Characters up to U+00FF encode to the byte with the same value."""
        for data, expected in (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.encode('latin1'), expected)

    def test_encode_errors(self):
        """Check the output of each built-in error handler of the encoder.

        The inputs contain U+20AC and/or lone surrogates, none of which
        Latin-1 can represent directly.
        """
        for data, error_handler, expected in (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\udc80]', 'backslashreplace',
             b'[\\u20ac\\udc80]'),
            # Numeric character references written out in ASCII:
            # 8364 == 0x20AC and 56448 == 0xDC80.
            ('[\u20ac\udc80]', 'xmlcharrefreplace',
             b'[&#8364;&#56448;]'),
            # Lone surrogates U+DC80 and U+DCFF are turned back into the
            # bytes 0x80 and 0xFF.
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.encode('latin1', error_handler),
                                 expected)

    def test_encode_surrogateescape_error(self):
        """``surrogateescape`` must still fail on a non-escapable character."""
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be encoded (it is an escaped byte),
            # but not the second: U+20AC is outside Latin-1 and is not a
            # surrogate in the U+DC80..U+DCFF range
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        """Every byte decodes to the code point with the same value."""
        for data, expected in (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ):
            with self.subTest(data=data, expected=expected):
                self.assertEqual(data.decode('latin1'), expected)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -10,6 +10,10 @@ Release date: XXXX-XX-XX ...@@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
error handler: the encoders are now up to 3 times as fast. Initial patch
written by Serhiy Storchaka.
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the - Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
getrandom() function instead of the getentropy() function. The getentropy() getrandom() function instead of the getentropy() function. The getentropy()
function is blocking to generate very good quality entropy, os.urandom() function is blocking to generate very good quality entropy, os.urandom()
......
...@@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend; pos = collend;
break; break;
case _Py_ERROR_SURROGATEESCAPE:
for (i = collstart; i < collend; ++i) {
ch = PyUnicode_READ(kind, data, i);
if (ch < 0xdc80 || 0xdcff < ch) {
/* Not a UTF-8b surrogate */
break;
}
*str++ = (char)(ch - 0xdc00);
++pos;
}
if (i >= collend)
break;
collstart = pos;
assert(collstart != collend);
/* fallback to general error handling */
default: default:
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc, encoding, reason, unicode, &exc,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment