Commit 28b21e50 authored by Serhiy Storchaka

Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:

1. Non-ASCII bytes were accepted after shift sequence.
2. A low surrogate could be emitted in case of error in high surrogate.
parent 223349cf
...@@ -898,6 +898,32 @@ class CP65001Test(ReadTest, unittest.TestCase): ...@@ -898,6 +898,32 @@ class CP65001Test(ReadTest, unittest.TestCase):
class UTF7Test(ReadTest, unittest.TestCase): class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7" encoding = "utf-7"
def test_ascii(self):
# Set D (directly encoded characters)
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789'
'\'(),-./:?')
self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
# Set O (optional direct characters)
set_o = ' !"#$%&*;<=>@[]^_`{|}'
self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
# +
self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
# White spaces
ws = ' \t\n\r'
self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
# Other ASCII characters
other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
set(set_d + set_o + '+' + ws)))
self.assertEqual(other_ascii.encode(self.encoding),
b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
'a+-b\x00c\x80d\u0100e\U00010000f', 'a+-b\x00c\x80d\u0100e\U00010000f',
...@@ -939,7 +965,9 @@ class UTF7Test(ReadTest, unittest.TestCase): ...@@ -939,7 +965,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
def test_errors(self): def test_errors(self):
tests = [ tests = [
(b'\xffb', '\ufffdb'),
(b'a\xffb', 'a\ufffdb'), (b'a\xffb', 'a\ufffdb'),
(b'a\xff\xffb', 'a\ufffd\ufffdb'),
(b'a+IK', 'a\ufffd'), (b'a+IK', 'a\ufffd'),
(b'a+IK-b', 'a\ufffdb'), (b'a+IK-b', 'a\ufffdb'),
(b'a+IK,b', 'a\ufffdb'), (b'a+IK,b', 'a\ufffdb'),
...@@ -955,6 +983,8 @@ class UTF7Test(ReadTest, unittest.TestCase): ...@@ -955,6 +983,8 @@ class UTF7Test(ReadTest, unittest.TestCase):
(b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
(b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+IKw-b\xff', 'a\u20acb\ufffd'),
(b'a+IKw\xffb', 'a\u20ac\ufffdb'),
] ]
for raw, expected in tests: for raw, expected in tests:
with self.subTest(raw=raw): with self.subTest(raw=raw):
...@@ -966,8 +996,36 @@ class UTF7Test(ReadTest, unittest.TestCase): ...@@ -966,8 +996,36 @@ class UTF7Test(ReadTest, unittest.TestCase):
self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
b'+IKwgrNgB3KA-')
self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
'\u20ac\u20ac\U000104A0')
self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
'\u20ac\u20ac\U000104A0')
test_lone_surrogates = None def test_lone_surrogates(self):
tests = [
(b'a+2AE-b', 'a\ud801b'),
(b'a+2AE\xffb', 'a\ufffdb'),
(b'a+2AE', 'a\ufffd'),
(b'a+2AEA-b', 'a\ufffdb'),
(b'a+2AH-b', 'a\ufffdb'),
(b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
(b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
(b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
(b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
(b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
(b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
(b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
(b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
]
for raw, expected in tests:
with self.subTest(raw=raw):
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
class UTF16ExTest(unittest.TestCase): class UTF16ExTest(unittest.TestCase):
......
...@@ -1524,7 +1524,7 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -1524,7 +1524,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
# Issue #2242: crash on some Windows/MSVC versions # Issue #2242: crash on some Windows/MSVC versions
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
# Direct encoded characters # Direct encoded characters
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
...@@ -1966,6 +1966,7 @@ class UnicodeTest(string_tests.CommonTest, ...@@ -1966,6 +1966,7 @@ class UnicodeTest(string_tests.CommonTest,
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
# Error handling (unknown character names) # Error handling (unknown character names)
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
......
...@@ -10,6 +10,8 @@ Release date: tba ...@@ -10,6 +10,8 @@ Release date: tba
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of malformed data.
- Issue #25280: Import trace messages emitted in verbose (-v) mode are no - Issue #25280: Import trace messages emitted in verbose (-v) mode are no
longer formatted twice. longer formatted twice.
......
...@@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* now leaving a base-64 section */ else { /* now leaving a base-64 section */
inShift = 0; inShift = 0;
s++;
if (surrogate) {
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
goto onError;
surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */ if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) { if (base64bits >= 6) {
/* We've seen at least one base-64 character */ /* We've seen at least one base-64 character */
s++;
errmsg = "partial character in shift sequence"; errmsg = "partial character in shift sequence";
goto utf7Error; goto utf7Error;
} }
else { else {
/* Some bits remain; they should be zero */ /* Some bits remain; they should be zero */
if (base64buffer != 0) { if (base64buffer != 0) {
s++;
errmsg = "non-zero padding bits in shift sequence"; errmsg = "non-zero padding bits in shift sequence";
goto utf7Error; goto utf7Error;
} }
} }
} }
if (ch != '-') { if (surrogate && DECODE_DIRECT(ch)) {
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
goto onError;
}
surrogate = 0;
if (ch == '-') {
/* '-' is absorbed; other terminating /* '-' is absorbed; other terminating
characters are preserved */ characters are preserved */
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) s++;
goto onError;
} }
} }
} }
...@@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* begin base64-encoded section */ else { /* begin base64-encoded section */
inShift = 1; inShift = 1;
surrogate = 0;
shiftOutStart = writer.pos; shiftOutStart = writer.pos;
base64bits = 0; base64bits = 0;
base64buffer = 0; base64buffer = 0;
...@@ -4450,6 +4451,7 @@ utf7Error: ...@@ -4450,6 +4451,7 @@ utf7Error:
if (inShift && !consumed) { /* in shift sequence, no more to follow */ if (inShift && !consumed) { /* in shift sequence, no more to follow */
/* if we're in an inconsistent state, that's an error */ /* if we're in an inconsistent state, that's an error */
inShift = 0;
if (surrogate || if (surrogate ||
(base64bits >= 6) || (base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) { (base64bits > 0 && base64buffer != 0)) {
...@@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, ...@@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
if (maxchar > writer->maxchar || writer->readonly) { if (maxchar > writer->maxchar || writer->readonly) {
/* resize + widen */ /* resize + widen */
maxchar = Py_MAX(maxchar, writer->maxchar);
newbuffer = PyUnicode_New(newlen, maxchar); newbuffer = PyUnicode_New(newlen, maxchar);
if (newbuffer == NULL) if (newbuffer == NULL)
return -1; return -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment