Commit 85d5c18c authored by R. David Murray, committed by GitHub

bpo-27240 Rewrite the email header folding algorithm. (#3488)

The original algorithm tried to delegate the folding to the tokens so
that those tokens whose folding rules differed could specify the
differences.  However, this resulted in a lot of duplicated code because
most of the rules were the same.

The new algorithm moves all folding logic into a set of functions
external to the token classes, but puts the information about which
tokens can be folded in which ways on the tokens...with the exception of
mime-parameters, which are a special case (which was not even
implemented in the old folder).

This algorithm can still probably be improved and hopefully simplified
somewhat.

Note that some of the test expectations are changed.  I believe the
changes are toward more desirable and consistent behavior: in general
when (re)folding a line the canonical version of the tokens is
generated, rather than preserving errors or extra whitespace.
parent 29ba6880
This diff is collapsed.
...@@ -245,13 +245,16 @@ class BaseHeader(str): ...@@ -245,13 +245,16 @@ class BaseHeader(str):
the header name and the ': ' separator. the header name and the ': ' separator.
""" """
# At some point we need to only put fws here if it was in the source. # At some point we need to put fws here iff it was in the source.
header = parser.Header([ header = parser.Header([
parser.HeaderLabel([ parser.HeaderLabel([
parser.ValueTerminal(self.name, 'header-name'), parser.ValueTerminal(self.name, 'header-name'),
parser.ValueTerminal(':', 'header-sep')]), parser.ValueTerminal(':', 'header-sep')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), ])
self._parse_tree]) if self._parse_tree:
header.append(
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
header.append(self._parse_tree)
return header.fold(policy=policy) return header.fold(policy=policy)
......
...@@ -14,18 +14,7 @@ class TestTokens(TestEmailBase): ...@@ -14,18 +14,7 @@ class TestTokens(TestEmailBase):
self.assertEqual(x, ' \t') self.assertEqual(x, ' \t')
self.assertEqual(str(x), '') self.assertEqual(str(x), '')
self.assertEqual(x.value, '') self.assertEqual(x.value, '')
self.assertEqual(x.encoded, ' \t') self.assertEqual(x.token_type, 'fws')
# UnstructuredTokenList
def test_undecodable_bytes_error_preserved(self):
badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
unst = parser.get_unstructured(badstr)
self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
parts = list(unst.parts)
self.assertDefectsEqual(parts[0].all_defects, [])
self.assertDefectsEqual(parts[1].all_defects, [])
self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
class TestParserMixin: class TestParserMixin:
...@@ -139,7 +128,6 @@ class TestParser(TestParserMixin, TestEmailBase): ...@@ -139,7 +128,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second', 'first second',
[], [],
'') '')
self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii') self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, 'jive') self.assertEqual(ew.lang, 'jive')
...@@ -150,7 +138,6 @@ class TestParser(TestParserMixin, TestEmailBase): ...@@ -150,7 +138,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second', 'first second',
[], [],
'') '')
self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii') self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, '') self.assertEqual(ew.lang, '')
...@@ -2700,27 +2687,36 @@ class TestFolding(TestEmailBase): ...@@ -2700,27 +2687,36 @@ class TestFolding(TestEmailBase):
# and with unicode tokens in the comments. Spaces inside the quotes # and with unicode tokens in the comments. Spaces inside the quotes
# currently don't do the right thing. # currently don't do the right thing.
def test_initial_whitespace_splitting(self): def test_split_at_whitespace_after_header_before_long_token(self):
body = parser.get_unstructured(' ' + 'x'*77) body = parser.get_unstructured(' ' + 'x'*77)
header = parser.Header([ header = parser.Header([
parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]), parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body]) parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
self._test(header, 'test: \n ' + 'x'*77 + '\n') self._test(header, 'test: \n ' + 'x'*77 + '\n')
def test_whitespace_splitting(self): def test_split_at_whitespace_before_long_token(self):
self._test(parser.get_unstructured('xxx ' + 'y'*77), self._test(parser.get_unstructured('xxx ' + 'y'*77),
'xxx \n ' + 'y'*77 + '\n') 'xxx \n ' + 'y'*77 + '\n')
def test_overlong_encodeable_is_wrapped(self):
first_token_with_whitespace = 'xxx '
chrome_leader = '=?utf-8?q?'
len_chrome = len(chrome_leader) + 2
len_non_y = len_chrome + len(first_token_with_whitespace)
self._test(parser.get_unstructured(first_token_with_whitespace +
'y'*80),
first_token_with_whitespace + chrome_leader +
'y'*(78-len_non_y) + '?=\n' +
' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
def test_long_filename_attachment(self): def test_long_filename_attachment(self):
folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"') self._test(parser.parse_content_disposition_header(
self.assertEqual( 'attachment; filename="TEST_TEST_TEST_TEST'
'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n', '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
folded "attachment;\n"
) " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"') "_TEST_TEST_TEST_TEST_TEST;\n"
self.assertEqual( " filename*1*=_TEST_TES.txt\n",
'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
folded
) )
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -27,7 +27,6 @@ class TestGeneratorBase: ...@@ -27,7 +27,6 @@ class TestGeneratorBase:
None None
"""), """),
# From is wrapped because wrapped it fits in 40.
40: textwrap.dedent("""\ 40: textwrap.dedent("""\
To: whom_it_may_concern@example.com To: whom_it_may_concern@example.com
From: From:
...@@ -40,11 +39,11 @@ class TestGeneratorBase: ...@@ -40,11 +39,11 @@ class TestGeneratorBase:
None None
"""), """),
# Neither to nor from fit even if put on a new line,
# so we leave them sticking out on the first line.
20: textwrap.dedent("""\ 20: textwrap.dedent("""\
To: whom_it_may_concern@example.com To:
From: nobody_you_want_to_know@example.com whom_it_may_concern@example.com
From:
nobody_you_want_to_know@example.com
Subject: We the Subject: We the
willing led by the willing led by the
unknowing are doing unknowing are doing
...@@ -169,6 +168,53 @@ class TestGeneratorBase: ...@@ -169,6 +168,53 @@ class TestGeneratorBase:
g.flatten(msg) g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0])) self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
def test_rfc2231_wrapping(self):
# This is pretty much just to make sure we don't have an infinite
# loop; I don't expect anyone to hit this in the field.
msg = self.msgmaker(self.typ(textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename="afilenamelongenoghtowraphere"
None
""")))
expected = textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename*0*=us-ascii''afilename;
filename*1*=longenoghtowraphere
None
""")
s = self.ioclass()
g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))
def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
# This is just to make sure we don't have an infinite loop; I don't
# expect anyone to hit this in the field, so I'm not bothering to make
# the result optimal (the encoding isn't needed).
msg = self.msgmaker(self.typ(textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename="afilenamelongenoghtowraphere"
None
""")))
expected = textwrap.dedent("""\
To: nobody
Content-Disposition:
attachment;
filename*0*=us-ascii''afilenamelongenoghtowraphere
None
""")
s = self.ioclass()
g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))
class TestGenerator(TestGeneratorBase, TestEmailBase): class TestGenerator(TestGeneratorBase, TestEmailBase):
......
This diff is collapsed.
The header folding algorithm for the new email policies has been rewritten,
which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231
folding is now done correctly.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment