Commit f6caeba0 authored by Barry Warsaw

Anthony Baxter's patch for non-strict parsing. This adds a `strict'
argument to the constructor -- defaulting to true -- which is
different from Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.
parent a0c8b9d4
...@@ -14,10 +14,9 @@ from email import Message ...@@ -14,10 +14,9 @@ from email import Message
EMPTYSTRING = '' EMPTYSTRING = ''
NL = '\n' NL = '\n'
class Parser: class Parser:
def __init__(self, _class=Message.Message): def __init__(self, _class=Message.Message, strict=1):
"""Parser of RFC 2822 and MIME email messages. """Parser of RFC 2822 and MIME email messages.
Creates an in-memory object tree representing the email message, which Creates an in-memory object tree representing the email message, which
...@@ -32,17 +31,25 @@ class Parser: ...@@ -32,17 +31,25 @@ class Parser:
_class is the class to instantiate for new message objects when they _class is the class to instantiate for new message objects when they
must be created. This class must have a constructor that can take must be created. This class must have a constructor that can take
zero arguments. Default is Message.Message. zero arguments. Default is Message.Message.
Optional strict tells the parser to be strictly RFC compliant or to be
more forgiving in parsing of ill-formatted MIME documents. When
non-strict mode is used, the parser will try to make up for missing or
erroneous boundaries and other peculiarities seen in the wild.
Defaults to strict parsing.
""" """
self._class = _class self._class = _class
self._strict = strict
def parse(self, fp): def parse(self, fp, headersonly=0):
root = self._class() root = self._class()
self._parseheaders(root, fp) self._parseheaders(root, fp)
self._parsebody(root, fp) if not headersonly:
self._parsebody(root, fp)
return root return root
def parsestr(self, text): def parsestr(self, text, headersonly=0):
return self.parse(StringIO(text)) return self.parse(StringIO(text), headersonly=headersonly)
def _parseheaders(self, container, fp): def _parseheaders(self, container, fp):
# Parse the headers, returning a list of header/value pairs. None as # Parse the headers, returning a list of header/value pairs. None as
...@@ -67,9 +74,13 @@ class Parser: ...@@ -67,9 +74,13 @@ class Parser:
if lineno == 1: if lineno == 1:
container.set_unixfrom(line) container.set_unixfrom(line)
continue continue
else: elif self._strict:
raise Errors.HeaderParseError( raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header') 'Unix-from in headers after first rfc822 header')
else:
# ignore the weirdly placed From_ line
# XXX: maybe set unixfrom anyway? or only if not already?
continue
# Header continuation line # Header continuation line
if line[0] in ' \t': if line[0] in ' \t':
if not lastheader: if not lastheader:
...@@ -84,8 +95,15 @@ class Parser: ...@@ -84,8 +95,15 @@ class Parser:
# instead of raising the exception). # instead of raising the exception).
i = line.find(':') i = line.find(':')
if i < 0: if i < 0:
raise Errors.HeaderParseError( if self._strict:
'Not a header, not a continuation') raise Errors.HeaderParseError(
"Not a header, not a continuation: ``%s''"%line)
elif lineno == 1 and line.startswith('--'):
# allow through duplicate boundary tags.
continue
else:
raise Errors.HeaderParseError(
"Not a header, not a continuation: ``%s''"%line)
if lastheader: if lastheader:
container[lastheader] = NL.join(lastvalue) container[lastheader] = NL.join(lastvalue)
lastheader = line[:i] lastheader = line[:i]
...@@ -122,31 +140,60 @@ class Parser: ...@@ -122,31 +140,60 @@ class Parser:
cre = re.compile('\r\n|\r|\n') cre = re.compile('\r\n|\r|\n')
mo = cre.search(payload, start) mo = cre.search(payload, start)
if mo: if mo:
start += len(mo.group(0)) * (1 + isdigest) start += len(mo.group(0))
# We create a compiled regexp first because we need to be able to # We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't # specify the start position, and the module function doesn't
# support this signature. :( # support this signature. :(
cre = re.compile('(?P<sep>\r\n|\r|\n)' + cre = re.compile('(?P<sep>\r\n|\r|\n)' +
re.escape(separator) + '--') re.escape(separator) + '--')
mo = cre.search(payload, start) mo = cre.search(payload, start)
if not mo: if mo:
terminator = mo.start()
linesep = mo.group('sep')
if mo.end() < len(payload):
# there's some post-MIME boundary epilogue
epilogue = payload[mo.end():]
elif self._strict:
raise Errors.BoundaryError( raise Errors.BoundaryError(
"Couldn't find terminating boundary: %s" % boundary) "Couldn't find terminating boundary: %s" % boundary)
terminator = mo.start() else:
linesep = mo.group('sep') # handle the case of no trailing boundary. I hate mail clients.
if mo.end() < len(payload): # check that it ends in a blank line
# there's some post-MIME boundary epilogue endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
epilogue = payload[mo.end():] mo = endre.search(payload)
if not mo:
raise Errors.BoundaryError(
"Couldn't find terminating boundary, and no "+
"trailing empty line")
else:
linesep = mo.group('sep')
terminator = len(payload)
# We split the textual payload on the boundary separator, which # We split the textual payload on the boundary separator, which
# includes the trailing newline. If the container is a # includes the trailing newline. If the container is a
# multipart/digest then the subparts are by default message/rfc822 # multipart/digest then the subparts are by default message/rfc822
# instead of text/plain. In that case, they'll have an extra # instead of text/plain. In that case, they'll have an optional
# newline before the headers to distinguish the message's headers # block of MIME headers, then an empty line followed by the
# from the subpart headers. # message headers.
separator += linesep * (1 + isdigest) separator += linesep
parts = payload[start:terminator].split(linesep + separator) parts = payload[start:terminator].split(linesep + separator)
for part in parts: for part in parts:
msgobj = self.parsestr(part) if isdigest:
if part[0] == linesep:
# There's no header block so create an empty message
# object as the container, and lop off the newline so
# we can parse the sub-subobject
msgobj = self._class()
part = part[1:]
else:
parthdrs, part = part.split(linesep+linesep, 1)
# msgobj in this case is the "message/rfc822" container
msgobj = self.parsestr(parthdrs, headersonly=1)
# while submsgobj is the message itself
submsgobj = self.parsestr(part)
msgobj.attach(submsgobj)
msgobj.set_default_type('message/rfc822')
else:
msgobj = self.parsestr(part)
container.preamble = preamble container.preamble = preamble
container.epilogue = epilogue container.epilogue = epilogue
container.attach(msgobj) container.attach(msgobj)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment