Commit 224ef3ec authored by R David Murray

#24211: Add RFC6532 support to the email library.

This could use more edge case tests, but the basic functionality is tested.
(Note that this changeset does not add tailored support for the RFC 6532
message/global MIME type, but the email package generic facilities will handle
it.)

Reviewed by Maciej Szulik.
parent c1ecef78
......@@ -378,6 +378,14 @@ added matters. To illustrate::
In addition to the settable attributes listed above that apply to all
policies, this policy adds the following additional attributes:
.. attribute:: utf8
If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
headers by encoding them as "encoded words". If ``True``, follow
:rfc:`6532` and use ``utf-8`` encoding for headers. Messages
formatted in this way may be passed to SMTP servers that support
the ``SMTPUTF8`` extension (:rfc:`6531`).
.. attribute:: refold_source
If the value for a header in the ``Message`` object originated from a
......@@ -356,6 +356,12 @@ email
header (``None`` if there is no such header). (Contributed by Abhilash Raj
in :issue:`21083`.)
* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set to
``True`` to encode email headers using the utf8 charset instead of using
encoded words. This allows ``Messages`` to be formatted according to
:rfc:`6532` and used with an SMTP server that supports the :rfc:`6531`
``SMTPUTF8`` extension. (Contributed by R. David Murray in :issue:`24211`.)
......@@ -320,17 +320,18 @@ class TokenList(list):
return ''.join(res)
def _fold(self, folded):
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in
tstr = str(part)
tlen = len(tstr)
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
charset = 'unknown-8bit'
# XXX: this should be a policy setting
# XXX: this should be a policy setting when utf8 is False.
charset = 'utf-8'
tstr = part.cte_encode(charset, folded.policy)
tlen = len(tstr)
......@@ -394,11 +395,12 @@ class UnstructuredTokenList(TokenList):
def _fold(self, folded):
last_ew = None
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in
tstr = str(part)
is_ew = False
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
......@@ -475,12 +477,13 @@ class Phrase(TokenList):
# comment that becomes a barrier across which we can't compose encoded
# words.
last_ew = None
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in
tstr = str(part)
tlen = len(tstr)
has_ew = False
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
......@@ -35,6 +35,13 @@ class EmailPolicy(Policy):
In addition to the settable attributes listed above that apply to
all Policies, this policy adds the following additional attributes:
utf8 -- if False (the default) message headers will be
serialized as ASCII, using encoded words to encode
any non-ASCII characters in the source strings. If
True, the message headers will be serialized using
utf8 and will not contain encoded words (see RFC
6532 for more on this serialization format).
refold_source -- if the value for a header in the Message object
came from the parsing of some source, this attribute
indicates whether or not a generator should refold
......@@ -72,6 +79,7 @@ class EmailPolicy(Policy):
utf8 = False
refold_source = 'long'
header_factory = HeaderRegistry()
content_manager = raw_data_manager
......@@ -175,9 +183,13 @@ class EmailPolicy(Policy):
refold_header setting, since there is no way to know whether the binary
data consists of single byte characters or multibyte characters.
If utf8 is true, headers are encoded to utf8, otherwise to ascii with
non-ASCII unicode rendered as encoded words.
folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
return folded.encode('ascii', 'surrogateescape')
charset = 'utf8' if self.utf8 else 'ascii'
return folded.encode(charset, 'surrogateescape')
def _fold(self, name, value, refold_binary=False):
if hasattr(value, 'name'):
......@@ -199,3 +211,4 @@ del default.header_factory
strict = default.clone(raise_on_defect=True)
SMTP = default.clone(linesep='\r\n')
HTTP = default.clone(linesep='\r\n', max_line_length=None)
SMTPUTF8 = SMTP.clone(utf8=True)
......@@ -2,6 +2,7 @@ import io
import textwrap
import unittest
from email import message_from_string, message_from_bytes
from email.message import EmailMessage
from email.generator import Generator, BytesGenerator
from email import policy
from test.test_email import TestEmailBase, parameterize
......@@ -194,6 +195,27 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
self.assertEqual(s.getvalue(), expected)
def test_smtputf8_policy(self):
    """Serialize a message with non-ASCII headers using policy.SMTPUTF8.

    Builds an EmailMessage whose From and Subject headers and body contain
    non-ASCII characters, flattens it with a BytesGenerator configured
    with policy.SMTPUTF8, and compares the output with the expected text
    encoded as UTF-8 with CRLF line endings.
    """
    msg = EmailMessage()
    # NOTE(review): the address inside the angle brackets looks truncated
    # (no domain part) -- confirm against the upstream test.
    msg['From'] = "Páolo <fő>"
    msg['To'] = 'Dinsdale'
    msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
    msg.set_content("oh là là, know what I mean, know what I mean?")
    # The blank line after MIME-Version is the header/body separator that
    # a serialized message must contain.
    expected = textwrap.dedent("""\
        From: Páolo <fő>
        To: Dinsdale
        Subject: Nudge nudge, wink, wink \u1F609
        Content-Type: text/plain; charset="utf-8"
        Content-Transfer-Encoding: 8bit
        MIME-Version: 1.0

        oh là là, know what I mean, know what I mean?
        """).encode('utf-8').replace(b'\n', b'\r\n')
    s = io.BytesIO()
    g = BytesGenerator(s, policy=policy.SMTPUTF8)
    # Without this call nothing is written to ``s`` and the assertion
    # would compare an empty buffer against ``expected``.
    g.flatten(msg)
    self.assertEqual(s.getvalue(), expected)
if __name__ == '__main__':
......@@ -27,6 +27,7 @@ class PolicyAPITests(unittest.TestCase):
# If any of these defaults change, the docs must be updated.
policy_defaults = compat32_defaults.copy()
'utf8': False,
'raise_on_defect': False,
'header_factory': email.policy.EmailPolicy.header_factory,
'refold_source': 'long',
......@@ -42,6 +43,9 @@ class PolicyAPITests(unittest.TestCase):
email.policy.default: make_defaults(policy_defaults, {}),
email.policy.SMTP: make_defaults(policy_defaults,
{'linesep': '\r\n'}),
email.policy.SMTPUTF8: make_defaults(policy_defaults,
{'linesep': '\r\n',
'utf8': True}),
email.policy.HTTP: make_defaults(policy_defaults,
{'linesep': '\r\n',
'max_line_length': None}),
......@@ -47,6 +47,9 @@ Core and Builtins
- Issue #24211: The email library now supports RFC 6532: it can generate
headers using utf-8 instead of encoded words.
- Issue #16314: Added support for the LZMA compression in distutils.
- Issue #21804: poplib now supports RFC 6856 (UTF8).
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment