Commit e12f6321 authored by Serhiy Storchaka

Issue #24848: Fixed bugs in UTF-7 decoding of misformed data:

1. Non-ASCII bytes were accepted after shift sequence.
2. A low surrogate could be emitted in case of error in high surrogate.
parent a87633e5
...@@ -642,6 +642,32 @@ class UTF8Test(ReadTest): ...@@ -642,6 +642,32 @@ class UTF8Test(ReadTest):
class UTF7Test(ReadTest): class UTF7Test(ReadTest):
encoding = "utf-7" encoding = "utf-7"
def test_ascii(self):
# Check how the codec named by self.encoding ("utf-7") treats each class
# of ASCII characters.  This is Python 2 code: plain string literals are
# byte strings and have a .decode() method.
# Set D (directly encoded characters): must pass through unchanged in
# both directions.
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz'
'0123456789'
'\'(),-./:?')
self.assertEqual(set_d.encode(self.encoding), set_d)
self.assertEqual(set_d.decode(self.encoding), set_d)
# Set O (optional direct characters): this codec is expected to leave
# them unencoded as well.
set_o = ' !"#$%&*;<=>@[]^_`{|}'
self.assertEqual(set_o.encode(self.encoding), set_o)
self.assertEqual(set_o.decode(self.encoding), set_o)
# A literal '+' is written as '+-' and read back as '+'.
self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
# White spaces (space, tab, LF, CR) pass through unchanged.
ws = ' \t\n\r'
self.assertEqual(ws.encode(self.encoding), ws)
self.assertEqual(ws.decode(self.encoding), ws)
# Other ASCII characters: every code point below 0x80 that is not in
# set D, set O, '+' or the white spaces above, in sorted order.  They
# are expected to be encoded together as one '+...-' base64 sequence.
# Only the encoding direction is asserted here.
other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
set(set_d + set_o + '+' + ws)))
self.assertEqual(other_ascii.encode(self.encoding),
'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
def test_partial(self): def test_partial(self):
self.check_partial( self.check_partial(
u"a+-b", u"a+-b",
...@@ -656,7 +682,9 @@ class UTF7Test(ReadTest): ...@@ -656,7 +682,9 @@ class UTF7Test(ReadTest):
def test_errors(self): def test_errors(self):
tests = [ tests = [
('\xffb', u'\ufffdb'),
('a\xffb', u'a\ufffdb'), ('a\xffb', u'a\ufffdb'),
('a\xff\xffb', u'a\ufffd\ufffdb'),
('a+IK', u'a\ufffd'), ('a+IK', u'a\ufffd'),
('a+IK-b', u'a\ufffdb'), ('a+IK-b', u'a\ufffdb'),
('a+IK,b', u'a\ufffdb'), ('a+IK,b', u'a\ufffdb'),
...@@ -672,6 +700,8 @@ class UTF7Test(ReadTest): ...@@ -672,6 +700,8 @@ class UTF7Test(ReadTest):
('a+//,+IKw-b', u'a\ufffd\u20acb'), ('a+//,+IKw-b', u'a\ufffd\u20acb'),
('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'), ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'), ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
('a+IKw-b\xff', u'a\u20acb\ufffd'),
('a+IKw\xffb', u'a\u20ac\ufffdb'),
] ]
for raw, expected in tests: for raw, expected in tests:
self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode, self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
...@@ -682,6 +712,35 @@ class UTF7Test(ReadTest): ...@@ -682,6 +712,35 @@ class UTF7Test(ReadTest):
self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-') self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-') self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0') self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
'+IKwgrNgB3KA-')
self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
u'\u20ac\u20ac\U000104A0')
self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
u'\u20ac\u20ac\U000104A0')
def test_lone_surrogates(self):
# Each entry is (raw UTF-7 bytes, text expected when decoding with the
# 'replace' error handler).  In every group the first input ends the
# base64 section with '-' right after a complete high surrogate
# (0xD801) and the surrogate is kept; the remaining inputs (a following
# \xff byte, truncated input, extra trailing bits, or non-zero padding
# bits) are expected to produce U+FFFD instead.
tests = [
# Surrogate is the only character in the base64 section.
('a+2AE-b', u'a\ud801b'),
('a+2AE\xffb', u'a\ufffdb'),
('a+2AE', u'a\ufffd'),
('a+2AEA-b', u'a\ufffdb'),
('a+2AH-b', u'a\ufffdb'),
# Surrogate preceded by one BMP character (U+20AC).
('a+IKzYAQ-b', u'a\u20ac\ud801b'),
('a+IKzYAQ\xffb', u'a\u20ac\ufffdb'),
('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
('a+IKzYAd-b', u'a\u20ac\ufffdb'),
# Surrogate preceded by two BMP characters.
('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
('a+IKwgrNgB\xffb', u'a\u20ac\u20ac\ufffdb'),
('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
]
# Note: the codec name is spelled out here rather than taken from
# self.encoding.
for raw, expected in tests:
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
class UTF16ExTest(unittest.TestCase): class UTF16ExTest(unittest.TestCase):
......
...@@ -1036,6 +1036,7 @@ class UnicodeTest( ...@@ -1036,6 +1036,7 @@ class UnicodeTest(
self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict') self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
self.assertEqual(u'abcde'.decode('ascii', 'ignore'), self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
u'abcde'.decode('ascii', errors='ignore')) u'abcde'.decode('ascii', errors='ignore'))
self.assertEqual(u'abcde'.decode('ascii', 'replace'), self.assertEqual(u'abcde'.decode('ascii', 'replace'),
......
...@@ -10,6 +10,8 @@ What's New in Python 2.7.11? ...@@ -10,6 +10,8 @@ What's New in Python 2.7.11?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data.
- Issue #25003: os.urandom() doesn't use getentropy() on Solaris because - Issue #25003: os.urandom() doesn't use getentropy() on Solaris because
getentropy() is blocking, whereas os.urandom() should not block. getentropy() getentropy() is blocking, whereas os.urandom() should not block. getentropy()
is supported since Solaris 11.3. is supported since Solaris 11.3.
......
...@@ -1716,29 +1716,29 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -1716,29 +1716,29 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* now leaving a base-64 section */ else { /* now leaving a base-64 section */
inShift = 0; inShift = 0;
s++;
if (surrogate) {
*p++ = surrogate;
surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */ if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) { if (base64bits >= 6) {
/* We've seen at least one base-64 character */ /* We've seen at least one base-64 character */
s++;
errmsg = "partial character in shift sequence"; errmsg = "partial character in shift sequence";
goto utf7Error; goto utf7Error;
} }
else { else {
/* Some bits remain; they should be zero */ /* Some bits remain; they should be zero */
if (base64buffer != 0) { if (base64buffer != 0) {
s++;
errmsg = "non-zero padding bits in shift sequence"; errmsg = "non-zero padding bits in shift sequence";
goto utf7Error; goto utf7Error;
} }
} }
} }
if (ch != '-') { if (surrogate && DECODE_DIRECT(ch))
*p++ = surrogate;
surrogate = 0;
if (ch == '-') {
/* '-' is absorbed; other terminating /* '-' is absorbed; other terminating
characters are preserved */ characters are preserved */
*p++ = ch; s++;
} }
} }
} }
...@@ -1751,6 +1751,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -1751,6 +1751,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
} }
else { /* begin base64-encoded section */ else { /* begin base64-encoded section */
inShift = 1; inShift = 1;
surrogate = 0;
shiftOutStart = p; shiftOutStart = p;
base64bits = 0; base64bits = 0;
base64buffer = 0; base64buffer = 0;
...@@ -1782,6 +1783,7 @@ utf7Error: ...@@ -1782,6 +1783,7 @@ utf7Error:
if (inShift && !consumed) { /* in shift sequence, no more to follow */ if (inShift && !consumed) { /* in shift sequence, no more to follow */
/* if we're in an inconsistent state, that's an error */ /* if we're in an inconsistent state, that's an error */
inShift = 0;
if (surrogate || if (surrogate ||
(base64bits >= 6) || (base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) { (base64bits > 0 && base64buffer != 0)) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment