Commit 85d5c18c authored by R. David Murray's avatar R. David Murray Committed by GitHub

bpo-27240 Rewrite the email header folding algorithm. (#3488)

The original algorithm tried to delegate the folding to the tokens so
that those tokens whose folding rules differed could specify the
differences.  However, this resulted in a lot of duplicated code because
most of the rules were the same.

The new algorithm moves all folding logic into a set of functions
external to the token classes, but puts the information about which
tokens can be folded in which ways on the tokens...with the exception of
mime-parameters, which are a special case (which was not even
implemented in the old folder).

This algorithm can still probably be improved and hopefully simplified
somewhat.

Note that some of the test expectations are changed.  I believe the
changes are toward more desirable and consistent behavior: in general
when (re) folding a line the canonical version of the tokens is
generated, rather than preserving errors or extra whitespace.
parent 29ba6880
......@@ -96,90 +96,6 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
def quote_string(value):
return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
#
# Accumulator for header folding
#
class _Folded:
    """Accumulator and state object used while folding a header.

    Finished text (including line separators) is collected in ``done``; the
    pieces of the line currently being built are collected in ``current``.
    """

    def __init__(self, maxlen, policy):
        # maxlen: maximum number of characters allowed on a folded line.
        self.maxlen = maxlen
        # policy: supplies ``linesep`` (read in newline()).
        self.policy = policy
        # Length of the text accounted for on the current line.
        self.lastlen = 0
        # Pending whitespace to emit before the next token; None means the
        # sticky-space handling in append_if_fits() is inactive.
        self.stickyspace = None
        # True until the first sticky-space append has happened.
        self.firstline = True
        self.done = []
        self.current = []

    def newline(self):
        """Flush the current line into ``done`` followed by the linesep."""
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        """Terminate the line in progress, if there is one."""
        if self.current:
            self.newline()

    def __str__(self):
        # Only finished lines are rendered; call finalize() first.
        return ''.join(self.done)

    def append(self, stoken):
        """Unconditionally add *stoken* to the current line.

        Note that ``lastlen`` is not updated here; callers adjust it.
        """
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        """Try to place *token* (rendered as *stoken*) and report success.

        *stoken* defaults to ``str(token)``.  Returns True when the text was
        appended (or the token was asked to fold itself), and False only
        when no sticky space is pending and the text is at least ``maxlen``
        characters long.
        """
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            stickyspace_len = len(self.stickyspace)
            # Case 1: pending whitespace plus the token fit on this line.
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            # Case 2: the token has internal fold points; move its leading
            # whitespace into the sticky space and let it fold itself.
            if token.has_fws:
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            # Case 3: there is pending whitespace and the token fits on a
            # line of its own with at least one leading whitespace character.
            if stickyspace_len and l + 1 <= self.maxlen:
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    # Not all of the whitespace fits in front of the token:
                    # part of it is left on the current line.
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            # Case 4: nothing else applies; start a new line unless this is
            # still the first line, then append regardless of length.
            if not self.firstline:
                self.newline()
            self.current.append(self.stickyspace)
            self.current.append(stoken)
            self.stickyspace = None
            self.firstline = False
            return True
        # No sticky space pending: append in place if it fits ...
        if self.lastlen + l <= self.maxlen:
            self.current.append(stoken)
            self.lastlen += l
            return True
        # ... otherwise start a new line if the text is shorter than maxlen.
        if l < self.maxlen:
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False
#
# TokenList and its subclasses
#
......@@ -187,6 +103,8 @@ class _Folded:
class TokenList(list):
token_type = None
syntactic_break = True
ew_combine_allowed = True
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
......@@ -207,84 +125,13 @@ class TokenList(list):
def all_defects(self):
return sum((x.all_defects for x in self), self.defects)
#
# Folding API
#
# parts():
#
# return a list of objects that constitute the "higher level syntactic
# objects" specified by the RFC as the best places to fold a header line.
# The returned objects must include leading folding white space, even if
# this means mutating the underlying parse tree of the object. Each object
# is only responsible for returning *its* parts, and should not drill down
# to any lower level except as required to meet the leading folding white
# space constraint.
#
# _fold(folded):
#
# folded: the result accumulator. This is an instance of _Folded.
# (XXX: I haven't finished factoring this out yet, the folding code
# pretty much uses this as a state object.) When the folded.current
# contains as much text as will fit, the _fold method should call
# folded.newline.
# folded.lastlen: the current length of the text stored in folded.current.
# folded.maxlen: The maximum number of characters that may appear on a
# folded line. Differs from the policy setting in that "no limit" is
# represented by +inf, which means it can be used in the trivially
# logical fashion in comparisons.
#
# Currently no subclasses implement parts, and I think this will remain
# true. A subclass only needs to implement _fold when the generic version
# isn't sufficient. _fold will need to be implemented primarily when it is
# possible for encoded words to appear in the specialized token-list, since
# there is no generic algorithm that can know where exactly the encoded
# words are allowed. A _fold implementation is responsible for filling
# lines in the same general way that the top level _fold does. It may, and
# should, call the _fold method of sub-objects in a similar fashion to that
# of the top level _fold.
#
# XXX: I'm hoping it will be possible to factor the existing code further
# to reduce redundancy and make the logic clearer.
@property
def parts(self):
klass = self.__class__
this = []
for token in self:
if token.startswith_fws():
if this:
yield this[0] if len(this)==1 else klass(this)
this.clear()
end_ws = token.pop_trailing_ws()
this.append(token)
if end_ws:
yield klass(this)
this = [end_ws]
if this:
yield this[0] if len(this)==1 else klass(this)
def startswith_fws(self):
return self[0].startswith_fws()
def pop_leading_fws(self):
if self[0].token_type == 'fws':
return self.pop(0)
return self[0].pop_leading_fws()
def pop_trailing_ws(self):
if self[-1].token_type == 'cfws':
return self.pop(-1)
return self[-1].pop_trailing_ws()
@property
def has_fws(self):
for part in self:
if part.has_fws:
return True
return False
def has_leading_comment(self):
return self[0].has_leading_comment()
def as_ew_allowed(self):
"""True if all top level tokens of this part may be RFC2047 encoded."""
return all(part.as_ew_allowed for part in self)
@property
def comments(self):
......@@ -294,69 +141,13 @@ class TokenList(list):
return comments
def fold(self, *, policy):
# max_line_length 0/None means no limit, ie: infinitely long.
maxlen = policy.max_line_length or float("+inf")
folded = _Folded(maxlen, policy)
self._fold(folded)
folded.finalize()
return str(folded)
def as_encoded_word(self, charset):
# This works only for things returned by 'parts', which include
# the leading fws, if any, that should be used.
res = []
ws = self.pop_leading_fws()
if ws:
res.append(ws)
trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
res.append(_ew.encode(str(self), charset))
res.append(trailer)
return ''.join(res)
def cte_encode(self, charset, policy):
res = []
for part in self:
res.append(part.cte_encode(charset, policy))
return ''.join(res)
def _fold(self, folded):
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in self.parts:
tstr = str(part)
tlen = len(tstr)
try:
str(part).encode(encoding)
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
charset = 'unknown-8bit'
else:
# XXX: this should be a policy setting when utf8 is False.
charset = 'utf-8'
tstr = part.cte_encode(charset, folded.policy)
tlen = len(tstr)
if folded.append_if_fits(part, tstr):
continue
# Peel off the leading whitespace if any and make it sticky, to
# avoid infinite recursion.
ws = part.pop_leading_fws()
if ws is not None:
folded.stickyspace = str(ws)
if folded.append_if_fits(part):
continue
if part.has_fws:
part._fold(folded)
continue
# There are no fold points in this one; it is too long for a single
# line and can't be split...we just have to put it on its own line.
folded.append(tstr)
folded.newline()
return _refold_parse_tree(self, policy=policy)
def pprint(self, indent=''):
print('\n'.join(self._pp(indent='')))
print(self.ppstr(indent=indent))
def ppstr(self, indent=''):
return '\n'.join(self._pp(indent=''))
return '\n'.join(self._pp(indent=indent))
def _pp(self, indent=''):
yield '{}{}/{}('.format(
......@@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList):
token_type = 'unstructured'
def _fold(self, folded):
last_ew = None
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in self.parts:
tstr = str(part)
is_ew = False
try:
str(part).encode(encoding)
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
charset = 'unknown-8bit'
else:
charset = 'utf-8'
if last_ew is not None:
# We've already done an EW, combine this one with it
# if there's room.
chunk = get_unstructured(
''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
oldlastlen = sum(len(x) for x in folded.current[:last_ew])
schunk = str(chunk)
lchunk = len(schunk)
if oldlastlen + lchunk <= folded.maxlen:
del folded.current[last_ew:]
folded.append(schunk)
folded.lastlen = oldlastlen + lchunk
continue
tstr = part.as_encoded_word(charset)
is_ew = True
if folded.append_if_fits(part, tstr):
if is_ew:
last_ew = len(folded.current) - 1
continue
if is_ew or last_ew:
# It's too big to fit on the line, but since we've
# got encoded words we can use encoded word folding.
part._fold_as_ew(folded)
continue
# Peel off the leading whitespace if any and make it sticky, to
# avoid infinite recursion.
ws = part.pop_leading_fws()
if ws is not None:
folded.stickyspace = str(ws)
if folded.append_if_fits(part):
continue
if part.has_fws:
part._fold(folded)
continue
# It can't be split...we just have to put it on its own line.
folded.append(tstr)
folded.newline()
last_ew = None
def cte_encode(self, charset, policy):
res = []
last_ew = None
for part in self:
spart = str(part)
try:
spart.encode('us-ascii')
res.append(spart)
except UnicodeEncodeError:
if last_ew is None:
res.append(part.cte_encode(charset, policy))
last_ew = len(res)
else:
tl = get_unstructured(''.join(res[last_ew:] + [spart]))
res.append(tl.as_encoded_word(charset))
return ''.join(res)
class Phrase(TokenList):
token_type = 'phrase'
def _fold(self, folded):
# As with Unstructured, we can have pure ASCII with or without
# surrogateescape encoded bytes, or we could have unicode. But this
# case is more complicated, since we have to deal with the various
# sub-token types and how they can be composed in the face of
# unicode-that-needs-CTE-encoding, and the fact that if a token has a
# comment, that comment becomes a barrier across which we can't compose
# encoded words.
last_ew = None
encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
for part in self.parts:
tstr = str(part)
tlen = len(tstr)
has_ew = False
try:
str(part).encode(encoding)
except UnicodeEncodeError:
if any(isinstance(x, errors.UndecodableBytesDefect)
for x in part.all_defects):
charset = 'unknown-8bit'
else:
charset = 'utf-8'
if last_ew is not None and not part.has_leading_comment():
# We've already done an EW, let's see if we can combine
# this one with it. The last_ew logic ensures that all we
# have at this point is atoms, no comments or quoted
# strings. So we can treat the text between the last
# encoded word and the content of this token as
# unstructured text, and things will work correctly. But
# we have to strip off any trailing comment on this token
# first, and if it is a quoted string we have to pull out
# the content (we're encoding it, so it no longer needs to
# be quoted).
if part[-1].token_type == 'cfws' and part.comments:
remainder = part.pop(-1)
else:
remainder = ''
for i, token in enumerate(part):
if token.token_type == 'bare-quoted-string':
part[i] = UnstructuredTokenList(token[:])
chunk = get_unstructured(
''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
schunk = str(chunk)
lchunk = len(schunk)
if last_ew + lchunk <= folded.maxlen:
del folded.current[last_ew:]
folded.append(schunk)
folded.lastlen = sum(len(x) for x in folded.current)
continue
tstr = part.as_encoded_word(charset)
tlen = len(tstr)
has_ew = True
if folded.append_if_fits(part, tstr):
if has_ew and not part.comments:
last_ew = len(folded.current) - 1
elif part.comments or part.token_type == 'quoted-string':
# If a comment is involved we can't combine EWs. And if a
# quoted string is involved, it's not worth the effort to
# try to combine them.
last_ew = None
continue
part._fold(folded)
def cte_encode(self, charset, policy):
res = []
last_ew = None
is_ew = False
for part in self:
spart = str(part)
try:
spart.encode('us-ascii')
res.append(spart)
except UnicodeEncodeError:
is_ew = True
if last_ew is None:
if not part.comments:
last_ew = len(res)
res.append(part.cte_encode(charset, policy))
elif not part.has_leading_comment():
if part[-1].token_type == 'cfws' and part.comments:
remainder = part.pop(-1)
else:
remainder = ''
for i, token in enumerate(part):
if token.token_type == 'bare-quoted-string':
part[i] = UnstructuredTokenList(token[:])
tl = get_unstructured(''.join(res[last_ew:] + [spart]))
res[last_ew:] = [tl.as_encoded_word(charset)]
if part.comments or (not is_ew and part.token_type == 'quoted-string'):
last_ew = None
return ''.join(res)
class Word(TokenList):
token_type = 'word'
......@@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList):
token_type = 'cfws'
def has_leading_comment(self):
return bool(self.comments)
class Atom(TokenList):
......@@ -579,6 +205,7 @@ class Atom(TokenList):
class Token(TokenList):
token_type = 'token'
encode_as_ew = False
class EncodedWord(TokenList):
......@@ -588,13 +215,6 @@ class EncodedWord(TokenList):
charset = None
lang = None
@property
def encoded(self):
if self.cte is not None:
return self.cte
_ew.encode(str(self), self.charset)
class QuotedString(TokenList):
......@@ -865,6 +485,7 @@ class InvalidMailbox(TokenList):
class Domain(TokenList):
token_type = 'domain'
as_ew_allowed = False
@property
def domain(self):
......@@ -879,11 +500,13 @@ class DotAtom(TokenList):
class DotAtomText(TokenList):
token_type = 'dot-atom-text'
as_ew_allowed = True
class AddrSpec(TokenList):
token_type = 'addr-spec'
as_ew_allowed = False
@property
def local_part(self):
......@@ -916,11 +539,13 @@ class AddrSpec(TokenList):
class ObsLocalPart(TokenList):
token_type = 'obs-local-part'
as_ew_allowed = False
class DisplayName(Phrase):
token_type = 'display-name'
ew_combine_allowed = False
@property
def display_name(self):
......@@ -960,6 +585,7 @@ class DisplayName(Phrase):
class LocalPart(TokenList):
token_type = 'local-part'
as_ew_allowed = False
@property
def value(self):
......@@ -995,6 +621,7 @@ class LocalPart(TokenList):
class DomainLiteral(TokenList):
token_type = 'domain-literal'
as_ew_allowed = False
@property
def domain(self):
......@@ -1081,6 +708,7 @@ class Value(TokenList):
class MimeParameters(TokenList):
token_type = 'mime-parameters'
syntactic_break = False
@property
def params(self):
......@@ -1165,6 +793,10 @@ class MimeParameters(TokenList):
class ParameterizedHeaderValue(TokenList):
# Set this false so that the value doesn't wind up on a new line even
# if it and the parameters would fit there but not on the first line.
syntactic_break = False
@property
def params(self):
for token in reversed(self):
......@@ -1172,18 +804,11 @@ class ParameterizedHeaderValue(TokenList):
return token.params
return {}
@property
def parts(self):
if self and self[-1].token_type == 'mime-parameters':
# We don't want to start a new line if all of the params don't fit
# after the value, so unwrap the parameter list.
return TokenList(self[:-1] + self[-1])
return TokenList(self).parts
class ContentType(ParameterizedHeaderValue):
token_type = 'content-type'
as_ew_allowed = False
maintype = 'text'
subtype = 'plain'
......@@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue):
class ContentDisposition(ParameterizedHeaderValue):
token_type = 'content-disposition'
as_ew_allowed = False
content_disposition = None
class ContentTransferEncoding(TokenList):
token_type = 'content-transfer-encoding'
as_ew_allowed = False
cte = '7bit'
class HeaderLabel(TokenList):
token_type = 'header-label'
as_ew_allowed = False
class Header(TokenList):
token_type = 'header'
def _fold(self, folded):
folded.append(str(self.pop(0)))
folded.lastlen = len(folded.current[0])
# The first line of the header is different from all others: we don't
# want to start a new object on a new line if it has any fold points in
# it that would allow part of it to be on the first header line.
# Further, if the first fold point would fit on the new line, we want
# to do that, but if it doesn't we want to put it on the first line.
# Folded supports this via the stickyspace attribute. If this
# attribute is not None, it does the special handling.
folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
rest = self.pop(0)
if self:
raise ValueError("Malformed Header token list")
rest._fold(folded)
#
# Terminal classes and instances
......@@ -1232,6 +844,10 @@ class Header(TokenList):
class Terminal(str):
as_ew_allowed = True
ew_combine_allowed = True
syntactic_break = True
def __new__(cls, value, token_type):
self = super().__new__(cls, value)
self.token_type = token_type
......@@ -1241,6 +857,9 @@ class Terminal(str):
def __repr__(self):
return "{}({})".format(self.__class__.__name__, super().__repr__())
def pprint(self):
print(self.__class__.__name__ + '/' + self.token_type)
@property
def all_defects(self):
return list(self.defects)
......@@ -1254,29 +873,14 @@ class Terminal(str):
'' if not self.defects else ' {}'.format(self.defects),
)]
def cte_encode(self, charset, policy):
value = str(self)
try:
value.encode('us-ascii')
return value
except UnicodeEncodeError:
return _ew.encode(value, charset)
def pop_trailing_ws(self):
# This terminates the recursion.
return None
def pop_leading_fws(self):
# This terminates the recursion.
return None
@property
def comments(self):
return []
def has_leading_comment(self):
return False
def __getnewargs__(self):
return(str(self), self.token_type)
......@@ -1290,8 +894,6 @@ class WhiteSpaceTerminal(Terminal):
def startswith_fws(self):
return True
has_fws = True
class ValueTerminal(Terminal):
......@@ -1302,11 +904,6 @@ class ValueTerminal(Terminal):
def startswith_fws(self):
return False
has_fws = False
def as_encoded_word(self, charset):
return _ew.encode(str(self), charset)
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
......@@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
def value(self):
return ''
@property
def encoded(self):
return self[:]
def __str__(self):
return ''
has_fws = True
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
......@@ -2751,7 +2342,7 @@ def get_parameter(value):
if value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {!r}".format(value))
appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if value and value[0] != "'":
token, value = get_attrtext(value)
......@@ -2760,7 +2351,7 @@ def get_parameter(value):
if not value or value[0] != "'":
raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
"delimiter, but found {}".format(value))
appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
value = value[1:]
if remainder is not None:
# Treat the rest of value as bare quoted string content.
......@@ -2965,3 +2556,255 @@ def parse_content_transfer_encoding_header(value):
token, value = get_phrase(value)
cte_header.append(token)
return cte_header
#
# Header folding
#
# Header folding is complex, with lots of rules and corner cases. The
# following code does its best to obey the rules and handle the corner
# cases, but you can be sure there are a few bugs:)
#
# This folder generally canonicalizes as it goes, preferring the stringified
# version of each token. The tokens contain information that supports the
# folder, including which tokens can be encoded in which ways.
#
# Folded text is accumulated in a simple list of strings ('lines'), each
# one of which should be less than policy.max_line_length ('maxlen').
#
def _steal_trailing_WSP_if_exists(lines):
    """Remove and return the trailing WSP character of the last line.

    If 'lines' is empty, or its last entry is empty or does not end in a
    character from WSP, 'lines' is left untouched and '' is returned.
    Otherwise the final character is cut off the last entry (in place) and
    handed back to the caller.
    """
    if not lines:
        return ''
    tail = lines[-1]
    if not tail or tail[-1] not in WSP:
        return ''
    lines[-1] = tail[:-1]
    return tail[-1]
def _refold_parse_tree(parse_tree, *, policy):
    """Return string of contents of parse_tree folded according to RFC rules.

    parse_tree is iterated to obtain the top level parts; non-terminal parts
    are unpacked into their sub-parts as needed.  policy supplies
    max_line_length, utf8 and linesep.  The returned string is the folded
    lines joined by, and terminated with, policy.linesep.
    """
    import sys

    # max_line_length 0/None means no limit.  Use an integer stand-in rather
    # than float("+inf"): the value is passed on to _fold_as_ew, which uses
    # it to compute a slice bound, and slicing with a float raises TypeError.
    maxlen = policy.max_line_length or sys.maxsize
    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
    lines = ['']
    last_ew = None
    wrap_as_ew_blocked = 0
    want_encoding = False
    # Sentinel pushed after the sub-parts of a part that forbids encoded
    # words; popping it ends the blocked region.
    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
    parts = list(parse_tree)
    while parts:
        part = parts.pop(0)
        if part is end_ew_not_allowed:
            wrap_as_ew_blocked -= 1
            continue
        tstr = str(part)
        try:
            tstr.encode(encoding)
            charset = encoding
        except UnicodeEncodeError:
            if any(isinstance(x, errors.UndecodableBytesDefect)
                   for x in part.all_defects):
                charset = 'unknown-8bit'
            else:
                # If policy.utf8 is false this should really be taken from a
                # 'charset' property on the policy.
                charset = 'utf-8'
            want_encoding = True
        if part.token_type == 'mime-parameters':
            # Mime parameter folding (using RFC2231) is extra special.
            _fold_mime_parameters(part, lines, maxlen, encoding)
            continue
        if want_encoding and not wrap_as_ew_blocked:
            if not part.as_ew_allowed:
                want_encoding = False
                last_ew = None
                if part.syntactic_break:
                    encoded_part = part.fold(policy=policy)
                    # Strip the complete trailing line separator; linesep
                    # may be longer than one character (e.g. '\r\n').
                    if encoded_part.endswith(policy.linesep):
                        encoded_part = encoded_part[
                            :len(encoded_part) - len(policy.linesep)]
                    if policy.linesep not in encoded_part:
                        # It fits on a single line
                        if len(encoded_part) > maxlen - len(lines[-1]):
                            # But not on this one, so start a new one.
                            newline = _steal_trailing_WSP_if_exists(lines)
                            # XXX what if encoded_part has no leading FWS?
                            lines.append(newline)
                        lines[-1] += encoded_part
                        continue
                # Either this is not a major syntactic break, so we don't
                # want it on a line by itself even if it fits, or it
                # doesn't fit on a line by itself.  Either way, fall through
                # to unpacking the subparts and wrapping them.
            if not hasattr(part, 'encode'):
                # It's not a Terminal, do each piece individually.
                parts = list(part) + parts
            else:
                # It's a terminal, wrap it as an encoded word, possibly
                # combining it with previously encoded words if allowed.
                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
                                      part.ew_combine_allowed, charset)
            want_encoding = False
            continue
        if len(tstr) <= maxlen - len(lines[-1]):
            lines[-1] += tstr
            continue
        # This part is too long to fit.  The RFC wants us to break at
        # "major syntactic breaks", so unless we don't consider this
        # to be one, check if it will fit on the next line by itself.
        if (part.syntactic_break and
                len(tstr) + 1 <= maxlen):
            newline = _steal_trailing_WSP_if_exists(lines)
            if newline or part.startswith_fws():
                lines.append(newline + tstr)
                continue
        if not hasattr(part, 'encode'):
            # It's not a terminal, try folding the subparts.
            newparts = list(part)
            if not part.as_ew_allowed:
                wrap_as_ew_blocked += 1
                newparts.append(end_ew_not_allowed)
            parts = newparts + parts
            continue
        if part.as_ew_allowed and not wrap_as_ew_blocked:
            # It doesn't need CTE encoding, but encode it anyway so we can
            # wrap it.
            parts.insert(0, part)
            want_encoding = True
            continue
        # We can't figure out how to wrap it, so give up.
        newline = _steal_trailing_WSP_if_exists(lines)
        if newline or part.startswith_fws():
            lines.append(newline + tstr)
        else:
            # We can't fold it onto the next line either...
            lines[-1] += tstr
    return policy.linesep.join(lines) + policy.linesep
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
    """Fold string to_encode into lines as encoded word, combining if allowed.

    Return the new value for last_ew, or None if ew_combine_allowed is False.

    If there is already an encoded word in the last line of lines (indicated by
    a non-None value for last_ew) and ew_combine_allowed is true, decode the
    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
    to_encode.  In either case, split to_encode as necessary so that the
    encoded segments fit within maxlen.

    'lines' is modified in place.  If maxlen is too small to hold even a
    one-character encoded word on an otherwise empty continuation line, one
    character per line is emitted anyway (producing over-long lines) so that
    the function always terminates.
    """
    if last_ew is not None and ew_combine_allowed:
        to_encode = str(
            get_unstructured(lines[-1][last_ew:] + to_encode))
        lines[-1] = lines[-1][:last_ew]
    if to_encode and to_encode[0] in WSP:
        # We're joining this to non-encoded text, so don't encode
        # the leading blank.
        leading_wsp = to_encode[0]
        to_encode = to_encode[1:]
        if len(lines[-1]) == maxlen:
            lines.append(_steal_trailing_WSP_if_exists(lines))
        lines[-1] += leading_wsp
    trailing_wsp = ''
    if to_encode and to_encode[-1] in WSP:
        # Likewise for the trailing space.
        trailing_wsp = to_encode[-1]
        to_encode = to_encode[:-1]
    new_last_ew = len(lines[-1]) if last_ew is None else last_ew
    encode_as = 'utf-8' if charset == 'us-ascii' else charset
    # The RFC2047 chrome takes up 7 characters plus the length
    # of the charset name.
    chrome_len = len(encode_as) + 7
    while to_encode:
        remaining_space = maxlen - len(lines[-1])
        text_space = remaining_space - chrome_len
        if text_space <= 0:
            if lines[-1].strip():
                # The current line already has content; retry on a fresh
                # continuation line.
                lines.append(' ')
                continue
            # Even an empty continuation line cannot hold the chrome.
            # Emit one character anyway instead of looping forever.
            text_space = 1
        # Compare before slicing so that a non-integer "no limit" maxlen
        # (e.g. float('inf')) never reaches the slice.
        if text_space >= len(to_encode):
            first_part = to_encode
        else:
            first_part = to_encode[:text_space]
        ew = _ew.encode(first_part, charset=encode_as)
        # One source character may expand to several encoded characters, so
        # shrink the chunk one character at a time until it fits.
        while len(ew) > remaining_space and len(first_part) > 1:
            first_part = first_part[:-1]
            ew = _ew.encode(first_part, charset=encode_as)
        if len(ew) > remaining_space and lines[-1].strip():
            # Not even a single character fits after the existing content;
            # try again on a fresh continuation line.
            lines.append(' ')
            continue
        lines[-1] += ew
        to_encode = to_encode[len(first_part):]
        if to_encode:
            lines.append(' ')
            new_last_ew = len(lines[-1])
    lines[-1] += trailing_wsp
    return new_last_ew if ew_combine_allowed else None
def _fold_mime_parameters(part, lines, maxlen, encoding):
    """Fold TokenList 'part' into the 'lines' list as mime parameters.

    Using the decoded list of parameters and values, format them according to
    the RFC rules, including using RFC2231 encoding if the value cannot be
    expressed in 'encoding' and/or the parameter+value is too long to fit
    within 'maxlen'.

    'part.params' must yield (name, value) pairs.  'lines' is modified in
    place; nothing is returned.
    """
    # Special case for RFC2231 encoding: start from decoded values and use
    # RFC2231 encoding iff needed.
    #
    # Note that the 1 and 2s being added to the length calculations are
    # accounting for the possibly-needed spaces and semicolons we'll be adding.
    #
    for name, value in part.params:
        # XXX What if this ';' puts us over maxlen the first time through the
        # loop?  We should split the header value onto a newline in that case,
        # but to do that we need to recognize the need earlier or reparse the
        # header, so I'm going to ignore that bug for now.  It'll only put us
        # one character over.
        if not lines[-1].rstrip().endswith(';'):
            lines[-1] += ';'
        charset = encoding
        error_handler = 'strict'
        try:
            value.encode(encoding)
            encoding_required = False
        except UnicodeEncodeError:
            encoding_required = True
            if utils._has_surrogates(value):
                charset = 'unknown-8bit'
                error_handler = 'surrogateescape'
            else:
                charset = 'utf-8'
        if encoding_required:
            encoded_value = urllib.parse.quote(
                value, safe='', errors=error_handler)
            tstr = "{}*={}''{}".format(name, charset, encoded_value)
        else:
            tstr = '{}={}'.format(name, quote_string(value))
        if len(lines[-1]) + len(tstr) + 1 < maxlen:
            lines[-1] = lines[-1] + ' ' + tstr
            continue
        elif len(tstr) + 2 <= maxlen:
            lines.append(' ' + tstr)
            continue
        # We need multiple sections.  We are allowed to mix encoded and
        # non-encoded sections, but we aren't going to.  We'll encode them all.
        section = 0
        extra_chrome = charset + "''"
        while value:
            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
            if maxlen <= chrome_len + 3:
                # We need room for the leading blank, the trailing semicolon,
                # and at least one character of the value.  If we don't
                # have that, we'd be stuck, so in that case fall back to
                # the RFC standard width.
                maxlen = 78
            maxchars = maxlen - chrome_len - 2
            # Every section must consume at least one character of the
            # value, otherwise the enclosing loop would never finish.  This
            # matters when a single character percent-encodes to more than
            # maxchars characters, or when maxchars is not positive.
            splitpoint = max(maxchars, 1)
            while True:
                partial = value[:splitpoint]
                encoded_value = urllib.parse.quote(
                    partial, safe='', errors=error_handler)
                if len(encoded_value) <= maxchars or splitpoint <= 1:
                    break
                splitpoint -= 1
            lines.append(" {}*{}*={}{}".format(
                name, section, extra_chrome, encoded_value))
            extra_chrome = ''
            section += 1
            value = value[splitpoint:]
            if value:
                lines[-1] += ';'
......@@ -245,13 +245,16 @@ class BaseHeader(str):
the header name and the ': ' separator.
"""
# At some point we need to only put fws here if it was in the source.
# At some point we need to put fws here iff it was in the source.
header = parser.Header([
parser.HeaderLabel([
parser.ValueTerminal(self.name, 'header-name'),
parser.ValueTerminal(':', 'header-sep')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]),
self._parse_tree])
])
if self._parse_tree:
header.append(
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
header.append(self._parse_tree)
return header.fold(policy=policy)
......
......@@ -14,18 +14,7 @@ class TestTokens(TestEmailBase):
self.assertEqual(x, ' \t')
self.assertEqual(str(x), '')
self.assertEqual(x.value, '')
self.assertEqual(x.encoded, ' \t')
# UnstructuredTokenList
def test_undecodable_bytes_error_preserved(self):
badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
unst = parser.get_unstructured(badstr)
self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
parts = list(unst.parts)
self.assertDefectsEqual(parts[0].all_defects, [])
self.assertDefectsEqual(parts[1].all_defects, [])
self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
self.assertEqual(x.token_type, 'fws')
class TestParserMixin:
......@@ -139,7 +128,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second',
[],
'')
self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, 'jive')
......@@ -150,7 +138,6 @@ class TestParser(TestParserMixin, TestEmailBase):
'first second',
[],
'')
self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
self.assertEqual(ew.charset, 'us-ascii')
self.assertEqual(ew.lang, '')
......@@ -2700,28 +2687,37 @@ class TestFolding(TestEmailBase):
# and with unicode tokens in the comments. Spaces inside the quotes
# currently don't do the right thing.
def test_initial_whitespace_splitting(self):
def test_split_at_whitespace_after_header_before_long_token(self):
body = parser.get_unstructured(' ' + 'x'*77)
header = parser.Header([
parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
self._test(header, 'test: \n ' + 'x'*77 + '\n')
def test_whitespace_splitting(self):
def test_split_at_whitespace_before_long_token(self):
self._test(parser.get_unstructured('xxx ' + 'y'*77),
'xxx \n ' + 'y'*77 + '\n')
def test_overlong_encodeable_is_wrapped(self):
first_token_with_whitespace = 'xxx '
chrome_leader = '=?utf-8?q?'
len_chrome = len(chrome_leader) + 2
len_non_y = len_chrome + len(first_token_with_whitespace)
self._test(parser.get_unstructured(first_token_with_whitespace +
'y'*80),
first_token_with_whitespace + chrome_leader +
'y'*(78-len_non_y) + '?=\n' +
' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
def test_long_filename_attachment(self):
folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
self.assertEqual(
'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
folded
)
folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
self.assertEqual(
'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
folded
)
self._test(parser.parse_content_disposition_header(
'attachment; filename="TEST_TEST_TEST_TEST'
'_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
"attachment;\n"
" filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
"_TEST_TEST_TEST_TEST_TEST;\n"
" filename*1*=_TEST_TES.txt\n",
)
if __name__ == '__main__':
unittest.main()
......@@ -27,7 +27,6 @@ class TestGeneratorBase:
None
"""),
# From is wrapped because wrapped it fits in 40.
40: textwrap.dedent("""\
To: whom_it_may_concern@example.com
From:
......@@ -40,11 +39,11 @@ class TestGeneratorBase:
None
"""),
# Neither to nor from fit even if put on a new line,
# so we leave them sticking out on the first line.
20: textwrap.dedent("""\
To: whom_it_may_concern@example.com
From: nobody_you_want_to_know@example.com
To:
whom_it_may_concern@example.com
From:
nobody_you_want_to_know@example.com
Subject: We the
willing led by the
unknowing are doing
......@@ -169,6 +168,53 @@ class TestGeneratorBase:
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
def test_rfc2231_wrapping(self):
# This is pretty much just to make sure we don't have an infinite
# loop; I don't expect anyone to hit this in the field.
msg = self.msgmaker(self.typ(textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename="afilenamelongenoghtowraphere"
None
""")))
expected = textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename*0*=us-ascii''afilename;
filename*1*=longenoghtowraphere
None
""")
s = self.ioclass()
g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))
def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
# This is just to make sure we don't have an infinite loop; I don't
# expect anyone to hit this in the field, so I'm not bothering to make
# the result optimal (the encoding isn't needed).
msg = self.msgmaker(self.typ(textwrap.dedent("""\
To: nobody
Content-Disposition: attachment;
filename="afilenamelongenoghtowraphere"
None
""")))
expected = textwrap.dedent("""\
To: nobody
Content-Disposition:
attachment;
filename*0*=us-ascii''afilenamelongenoghtowraphere
None
""")
s = self.ioclass()
g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
g.flatten(msg)
self.assertEqual(s.getvalue(), self.typ(expected))
class TestGenerator(TestGeneratorBase, TestEmailBase):
......
......@@ -229,14 +229,14 @@ class TestContentTypeHeader(TestHeaderBase):
defects = args[1] if l>1 else []
decoded = args[2] if l>2 and args[2] is not DITTO else source
header = 'Content-Type:' + ' ' if source else ''
folded = args[3] if l>3 else header + source + '\n'
folded = args[3] if l>3 else header + decoded + '\n'
h = self.make_header('Content-Type', source)
self.assertEqual(h.content_type, content_type)
self.assertEqual(h.maintype, maintype)
self.assertEqual(h.subtype, subtype)
self.assertEqual(h.params, parmdict)
with self.assertRaises(TypeError):
h.params['abc'] = 'xyz' # params is read-only.
h.params['abc'] = 'xyz' # make sure params is read-only.
self.assertDefectsEqual(h.defects, defects)
self.assertEqual(h, decoded)
self.assertEqual(h.fold(policy=policy.default), folded)
......@@ -373,9 +373,10 @@ class TestContentTypeHeader(TestHeaderBase):
'text/plain; Charset="utf-8"'),
# Since this is pretty much the ur-mimeheader, we'll put all the tests
# that exercise the parameter parsing and formatting here.
#
# XXX: question: is minimal quoting preferred?
# that exercise the parameter parsing and formatting here. Note that
# when we refold we may canonicalize, so things like whitespace,
# quoting, and rfc2231 encoding may change from what was in the input
# header.
'unquoted_param_value': (
'text/plain; title=foo',
......@@ -384,7 +385,8 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'title': 'foo'},
[],
'text/plain; title="foo"'),
'text/plain; title="foo"',
),
'param_value_with_tspecials': (
'text/plain; title="(bar)foo blue"',
......@@ -415,7 +417,8 @@ class TestContentTypeHeader(TestHeaderBase):
'mixed',
{'boundary': 'CPIMSSMTPC06p5f3tG'},
[],
'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'),
'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"',
),
'spaces_around_semis': (
('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; '
......@@ -429,14 +432,31 @@ class TestContentTypeHeader(TestHeaderBase):
[],
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"'),
# XXX: it could be that we will eventually prefer to fold starting
# from the decoded value, in which case these spaces and similar
# spaces in other tests will be wrong.
('Content-Type: image/jpeg; name="wibble.JPG" ; '
'x-mac-type="4A504547" ;\n'
('Content-Type: image/jpeg; name="wibble.JPG";'
' x-mac-type="4A504547";\n'
' x-mac-creator="474B4F4E"\n'),
),
'lots_of_mime_params': (
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
'image/jpeg',
'image',
'jpeg',
{'name': 'wibble.JPG',
'x-mac-type': '4A504547',
'x-mac-creator': '474B4F4E',
'x-extrastuff': 'make it longer'},
[],
('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
# In this case the whole of the MimeParameters does *not* fit
# on one line, so we break at a lower syntactic level.
('Content-Type: image/jpeg; name="wibble.JPG";'
' x-mac-type="4A504547";\n'
' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'),
),
'semis_inside_quotes': (
'image/jpeg; name="Jim&amp;&amp;Jill"',
'image/jpeg',
......@@ -460,19 +480,25 @@ class TestContentTypeHeader(TestHeaderBase):
[],
r'image/jpeg; name="Jim \"Bob\" Jill"'),
# XXX: This test works except for the refolding of the header. I'll
# deal with that bug when I deal with the other folding bugs.
#'non_ascii_in_params': (
# ('foo\xa7/bar; b\xa7r=two; '
# 'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
# 'surrogateescape')),
# 'foo\uFFFD/bar',
# 'foo\uFFFD',
# 'bar',
# {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
# [errors.UndecodableBytesDefect]*3,
# 'foo�/bar; b�r="two"; baz="thr�e"',
# ),
'non_ascii_in_params': (
('foo\xa7/bar; b\xa7r=two; '
'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
'surrogateescape')),
'foo\uFFFD/bar',
'foo\uFFFD',
'bar',
{'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
[errors.UndecodableBytesDefect]*3,
'foo�/bar; b�r="two"; baz="thr�e"',
# XXX Two bugs here: the mime type is not allowed to be an encoded
# word, and we shouldn't be emitting surrogates in the parameter
# names. But I don't know what the behavior should be here, so I'm
# punting for now. In practice this is unlikely to be encountered
# since headers with binary in them only come from a binary source
# and are almost certain to be re-emitted without refolding.
'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n'
" baz*=unknown-8bit''thr%A7e\n",
),
# RFC 2231 parameter tests.
......@@ -494,19 +520,20 @@ class TestContentTypeHeader(TestHeaderBase):
[],
r'image/jpeg; bar="baz\"foobar\"baz"'),
# XXX: This test works except for the refolding of the header. I'll
# deal with that bug when I deal with the other folding bugs.
#'non_ascii_rfc2231_value': (
# ('text/plain; charset=us-ascii; '
# "title*=us-ascii'en'This%20is%20"
# 'not%20f\xa7n').encode('latin-1').decode('us-ascii',
# 'surrogateescape'),
# 'text/plain',
# 'text',
# 'plain',
# {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
# [errors.UndecodableBytesDefect],
# 'text/plain; charset="us-ascii"; title="This is not f�n"'),
'non_ascii_rfc2231_value': (
('text/plain; charset=us-ascii; '
"title*=us-ascii'en'This%20is%20"
'not%20f\xa7n').encode('latin-1').decode('us-ascii',
'surrogateescape'),
'text/plain',
'text',
'plain',
{'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
[errors.UndecodableBytesDefect],
'text/plain; charset="us-ascii"; title="This is not f�n"',
'Content-Type: text/plain; charset="us-ascii";\n'
" title*=unknown-8bit''This%20is%20not%20f%A7n\n",
),
'rfc2231_encoded_charset': (
'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii',
......@@ -529,8 +556,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is ***fun*** is it not.pdf'},
[],
'text/plain; name="This is ***fun*** is it not.pdf"',
('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n'
'\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'),
),
# Make sure we also handle it if there are spurious double quotes.
......@@ -545,9 +570,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it not.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it not.pdf"',
('Content-Type: text/plain;\t'
'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n'
'\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'),
),
'rfc2231_single_quote_inside_double_quotes': (
......@@ -562,9 +584,8 @@ class TestContentTypeHeader(TestHeaderBase):
[errors.InvalidHeaderDefect]*2,
('text/plain; charset="us-ascii"; '
'title="This is really ***fun*** isn\'t it!"'),
('Content-Type: text/plain; charset=us-ascii;\n'
'\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n'
'\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'),
('Content-Type: text/plain; charset="us-ascii";\n'
' title="This is really ***fun*** isn\'t it!"\n'),
),
'rfc2231_single_quote_in_value_with_charset_and_lang': (
......@@ -576,9 +597,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "Frank's Document"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="Frank\'s Document"',
('Content-Type: application/x-foo;\t'
'name*0*="us-ascii\'en-us\'Frank\'s";\n'
' name*1*=" Document"\n'),
),
'rfc2231_single_quote_in_non_encoded_value': (
......@@ -590,9 +608,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "us-ascii'en-us'Frank's Document"},
[],
'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"',
('Content-Type: application/x-foo;\t'
'name*0="us-ascii\'en-us\'Frank\'s";\n'
' name*1=" Document"\n'),
),
'rfc2231_no_language_or_charset': (
......@@ -615,12 +630,8 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
('Content-Type: text/plain;\t'
'name*0*="\'\'This%20is%20even%20more%20";\n'
'\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
# XXX: see below...the first name line here should be *0 not *0*.
'rfc2231_partly_encoded': (
("text/plain;"
'\tname*0*="\'\'This%20is%20even%20more%20";'
......@@ -632,9 +643,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'This is even more ***fun*** is it.pdf'},
[errors.InvalidHeaderDefect]*2,
'text/plain; name="This is even more ***fun*** is it.pdf"',
('Content-Type: text/plain;\t'
'name*0*="\'\'This%20is%20even%20more%20";\n'
'\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
),
'rfc2231_partly_encoded_2': (
......@@ -647,10 +655,11 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'},
[errors.InvalidHeaderDefect],
'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"',
('Content-Type: text/plain;\t'
'name*0*="\'\'This%20is%20even%20more%20";\n'
'\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
('text/plain;'
' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'),
('Content-Type: text/plain;\n'
' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is'
' it.pdf"\n'),
),
'rfc2231_unknown_charset_treated_as_ascii': (
......@@ -669,9 +678,12 @@ class TestContentTypeHeader(TestHeaderBase):
'plain',
{'charset': 'utf-8\uFFFD\uFFFD\uFFFD'},
[errors.UndecodableBytesDefect],
'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'),
'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"',
"Content-Type: text/plain;"
" charset*=unknown-8bit''utf-8%F1%F2%F3\n",
),
'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': (
'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': (
"text/plain; charset*=ascii''utf-8%E2%80%9D",
'text/plain',
'text',
......@@ -679,9 +691,11 @@ class TestContentTypeHeader(TestHeaderBase):
{'charset': 'utf-8”'},
[errors.UndecodableBytesDefect],
'text/plain; charset="utf-8”"',
# XXX Should folding change the charset to utf8? Currently it just
# reproduces the original, which is arguably fine.
"Content-Type: text/plain;"
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
),
# XXX: if the above were *re*folded, it would get tagged as utf-8
# instead of ascii in the param, since it now contains non-ASCII.
'rfc2231_encoded_then_unencoded_segments': (
('application/x-foo;'
......@@ -694,9 +708,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect],
'application/x-foo; name="My Document For You"',
('Content-Type: application/x-foo;\t'
'name*0*="us-ascii\'en-us\'My";\n'
'\tname*1=" Document";\tname*2=" For You"\n'),
),
# My reading of the RFC is that this is an invalid header. The RFC
......@@ -713,11 +724,6 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': 'My Document For You'},
[errors.InvalidHeaderDefect]*3,
'application/x-foo; name="My Document For You"',
("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t"
# XXX: the newline is in the wrong place, come back and fix
# this when the rest of tests pass.
'name*1*=" Document"\n;'
'\tname*2*=" For You"\n'),
),
# XXX: I would say this one should default to ascii/en for the
......@@ -730,8 +736,7 @@ class TestContentTypeHeader(TestHeaderBase):
# charset'lang'value pattern exactly *and* there is at least one
# encoded segment. Implementing that algorithm will require some
# refactoring, so I haven't done it (yet).
'rfc2231_qouted_unencoded_then_encoded_segments': (
'rfc2231_quoted_unencoded_then_encoded_segments': (
('application/x-foo;'
'\tname*0="us-ascii\'en-us\'My";'
'\tname*1*=" Document";'
......@@ -742,9 +747,25 @@ class TestContentTypeHeader(TestHeaderBase):
{'name': "us-ascii'en-us'My Document For You"},
[errors.InvalidHeaderDefect]*2,
'application/x-foo; name="us-ascii\'en-us\'My Document For You"',
('Content-Type: application/x-foo;\t'
'name*0="us-ascii\'en-us\'My";\n'
'\tname*1*=" Document";\tname*2*=" For You"\n'),
),
# Make sure our folding algorithm produces multiple sections correctly.
# We could mix encoded and non-encoded segments, but we don't; we just
# make them all encoded. It might be worth fixing that, since the
# sections can get used for wrapping ascii text.
'rfc2231_folded_segments_correctly_formatted': (
('application/x-foo;'
'\tname="' + "with spaces"*8 + '"'),
'application/x-foo',
'application',
'x-foo',
{'name': "with spaces"*8},
[],
'application/x-foo; name="' + "with spaces"*8 + '"',
"Content-Type: application/x-foo;\n"
" name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith"
"%20spaceswith;\n"
" name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n"
),
}
......@@ -827,8 +848,8 @@ class TestContentDisposition(TestHeaderBase):
[],
('attachment; filename="genome.jpeg"; '
'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'),
('Content-Disposition: attachment; filename=genome.jpeg;\n'
' modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'),
('Content-Disposition: attachment; filename="genome.jpeg";\n'
' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'),
),
'no_value': (
......@@ -873,7 +894,7 @@ class TestMIMEVersionHeader(TestHeaderBase):
if source:
source = ' ' + source
self.assertEqual(h.fold(policy=policy.default),
'MIME-Version:' + source + '\n')
'MIME-Version:' + source + '\n')
version_string_params = {
......@@ -1546,15 +1567,39 @@ class TestFolding(TestHeaderBase):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n')
'Subject: \n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
' =?utf-8?q?nsistin?=\n'
' =?utf-8?q?gofasin?=\n'
' =?utf-8?q?gleword?=\n'
' =?utf-8?q?thatwon?=\n'
' =?utf-8?q?tfit?=\n'
)
def test_fold_unstructured_with_two_overlong_words(self):
h = self.make_header('Subject', 'thisisaverylonglineconsistingofa'
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n'
' plusanotherverylongwordthatwontfit\n')
'Subject: \n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
' =?utf-8?q?nsistin?=\n'
' =?utf-8?q?gofasin?=\n'
' =?utf-8?q?gleword?=\n'
' =?utf-8?q?thatwon?=\n'
' =?utf-8?q?tfit_pl?=\n'
' =?utf-8?q?usanoth?=\n'
' =?utf-8?q?erveryl?=\n'
' =?utf-8?q?ongword?=\n'
' =?utf-8?q?thatwon?=\n'
' =?utf-8?q?tfit?=\n'
)
# XXX Need test for when max_line_length is less than the chrome size.
def test_fold_unstructured_with_slightly_long_word(self):
h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen')
......@@ -1590,6 +1635,18 @@ class TestFolding(TestHeaderBase):
self.assertEqual(h.fold(policy=policy.default),
'Date: Sat, 02 Feb 2002 17:00:06 -0800\n')
def test_fold_overlong_words_using_RFC2047(self):
h = self.make_header(
'X-Report-Abuse',
'<https://www.mailitapp.com/report_abuse.php?'
'mid=xxx-xxx-xxxxxxxxxxxxxxxxxxxxxxxx==-xxx-xx-xx>')
self.assertEqual(
h.fold(policy=policy.default),
'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
'com/report=5F?=\n'
' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
' =?utf-8?q?xx-xx=3E?=\n')
if __name__ == '__main__':
......
The header folding algorithm for the new email policies has been rewritten,
which also fixes bpo-30788, bpo-31831, and bpo-32182. In particular, RFC2231
folding is now done correctly.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment