Commit 66c4f3f3 authored by Abhilash Raj, committed by Barry Warsaw

bpo-21315: Fix parsing of encoded words with missing leading ws. (#13425)

* bpo-21315: Fix parsing of encoded words with missing leading ws.

Because of the missing leading whitespace, an encoded word would get parsed as
part of an unstructured token. This patch fixes that by looking for encoded
words when splitting tokens on whitespace.

Missing trailing whitespace after an encoded word now registers a defect
instead.

Original patch suggestion by David R. Murray on bpo-21315.
parent 142566c0
...@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') ...@@ -96,6 +96,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
def quote_string(value): def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
# (that is: "=?" charset "?" encoding "?" encoded-text "?=").  The pattern is
# unanchored, so .search() finds an encoded word embedded anywhere in a
# token.  The encoded-text part is matched non-greedily so the match ends at
# the first "?=" that follows it.
# NOTE(review): re.MULTILINE only changes the meaning of ^ and $, and the
# pattern uses neither, so the flag appears to have no effect -- confirm
# before relying on it.
rfc2047_matcher = re.compile(r'''
=\? # literal =?
[^?]* # charset
\? # literal ?
[qQbB] # literal 'q' or 'b', case insensitive
\? # literal ?
.*? # encoded word
\?= # literal ?=
''', re.VERBOSE | re.MULTILINE)
# #
# TokenList and its subclasses # TokenList and its subclasses
# #
...@@ -1052,6 +1064,10 @@ def get_encoded_word(value): ...@@ -1052,6 +1064,10 @@ def get_encoded_word(value):
_validate_xtext(vtext) _validate_xtext(vtext)
ew.append(vtext) ew.append(vtext)
text = ''.join(remainder) text = ''.join(remainder)
# Encoded words should be followed by a WS
if value and value[0] not in WSP:
ew.defects.append(errors.InvalidHeaderDefect(
"missing trailing whitespace after encoded-word"))
return ew, value return ew, value
def get_unstructured(value): def get_unstructured(value):
...@@ -1104,6 +1120,11 @@ def get_unstructured(value): ...@@ -1104,6 +1120,11 @@ def get_unstructured(value):
unstructured.append(token) unstructured.append(token)
continue continue
tok, *remainder = _wsp_splitter(value, 1) tok, *remainder = _wsp_splitter(value, 1)
# Split in the middle of an atom if there is a rfc2047 encoded word
# which does not have WSP on both sides. The defect will be registered
# the next time through the loop.
if rfc2047_matcher.search(tok):
tok, *remainder = value.partition('=?')
vtext = ValueTerminal(tok, 'vtext') vtext = ValueTerminal(tok, 'vtext')
_validate_xtext(vtext) _validate_xtext(vtext)
unstructured.append(vtext) unstructured.append(vtext)
......
...@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase): ...@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
'=?us-ascii?q?first?==?utf-8?q?second?=', '=?us-ascii?q?first?==?utf-8?q?second?=',
'first', 'first',
'first', 'first',
[], [errors.InvalidHeaderDefect],
'=?utf-8?q?second?=') '=?utf-8?q?second?=')
def test_get_encoded_word_sets_extra_attributes(self): def test_get_encoded_word_sets_extra_attributes(self):
...@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase): ...@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
'=?utf-8?q?foo?==?utf-8?q?bar?=', '=?utf-8?q?foo?==?utf-8?q?bar?=',
'foobar', 'foobar',
'foobar', 'foobar',
[errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'')
def test_get_unstructured_ew_without_leading_whitespace(self):
self._test_get_x(
self._get_unst,
'nowhitespace=?utf-8?q?somevalue?=',
'nowhitespacesomevalue',
'nowhitespacesomevalue',
[errors.InvalidHeaderDefect],
'')
def test_get_unstructured_ew_without_trailing_whitespace(self):
self._test_get_x(
self._get_unst,
'=?utf-8?q?somevalue?=nowhitespace',
'somevaluenowhitespace',
'somevaluenowhitespace',
[errors.InvalidHeaderDefect], [errors.InvalidHeaderDefect],
'') '')
...@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase): ...@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase):
'"=?utf-8?Q?not_really_valid?="', '"=?utf-8?Q?not_really_valid?="',
'"not really valid"', '"not really valid"',
'not really valid', 'not really valid',
[errors.InvalidHeaderDefect], [errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'') '')
# get_comment # get_comment
......
...@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase): ...@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
'rfc2047_atom_in_quoted_string_is_decoded': 'rfc2047_atom_in_quoted_string_is_decoded':
('"=?utf-8?q?=C3=89ric?=" <foo@example.com>', ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
[errors.InvalidHeaderDefect], [errors.InvalidHeaderDefect,
errors.InvalidHeaderDefect],
'Éric <foo@example.com>', 'Éric <foo@example.com>',
'Éric', 'Éric',
'foo@example.com', 'foo@example.com',
......
Email headers containing RFC 2047 encoded words are now parsed even when the
leading whitespace is missing, and a defect is registered. Also, missing
trailing whitespace after encoded words is now registered as a defect.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment