Commit 0813d76c authored by Thomas Wouters

Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17;  author: anthonybaxter;  state: Exp;  lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09;  author: anthonybaxter;  state: Exp;  lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56;  author: anthonybaxter;  state: Exp;  lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
parent d4079e1f
...@@ -22,6 +22,75 @@ except NameError: ...@@ -22,6 +22,75 @@ except NameError:
NLCRE = re.compile('\r\n|\r|\n') NLCRE = re.compile('\r\n|\r|\n')
class TextUtil:
    """Wrap a file object and add line push-back, peeking and
    read-until-pattern helpers.

    Lines handed to unreadline() are kept on a stack (self.unread) and
    are served, most recently pushed first, before any further data is
    taken from the wrapped file object.
    """

    def __init__(self, fp):
        # The wrapped object; only its readline() and read() methods are
        # used by this class.
        self.fp = fp
        # Stack of pushed-back lines; the last element is served next.
        self.unread = []

    def readline(self):
        """Return the next line of data.

        If lines have been pushed back with unreadline(), the most
        recently pushed one is returned (and removed); otherwise the
        line comes from the wrapped file object.
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data and clear the push-back stack.

        Pushed-back lines come first, in the order readline() would have
        returned them, followed by whatever is left in the wrapped file
        object.
        """
        rest = self.fp.read()
        if self.unread:
            # The stack is served last-in first-out, so reverse a copy
            # to get reading order.  The pieces are joined with nothing
            # in between: pushed-back lines are expected to still carry
            # their own line endings (they normally come from
            # readline()), so inserting "\n" would add spurious blank
            # lines to the text.
            pending = self.unread[:]
            pending.reverse()
            rest = "".join(pending) + rest
            self.unread = []
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until one matches the given RE.

        `re' is a compiled regular expression object (note that inside
        this method the name hides the `re' module); it is applied with
        re.match() to each line in turn.

        Returns a 2-tuple of the text read before the matching line
        (including the matching line itself if includematch is true) and
        the RE match object.  If afterblank is true, a line only counts
        as a match when the line immediately before it was blank.  The
        matching line is always consumed, so reading continues with the
        line that follows it.  If end-of-file is reached first, the text
        read so far is returned with None in place of the match object.
        """
        pieces = []
        # True when the previously read line was blank.  It is updated
        # only after the current line has been tested, so the test below
        # really looks at the preceding line.
        prevblank = 0
        while 1:
            line = self.readline()
            if not line:
                # end of file
                return EMPTYSTRING.join(pieces), None
            m = re.match(line)
            if m and (prevblank or not afterblank):
                if includematch:
                    pieces.append(line)
                return EMPTYSTRING.join(pieces), m
            # Not the line we are looking for: keep it (blank lines
            # included) as part of the returned text.
            pieces.append(line)
            prevblank = NLCRE.match(line) is not None
class Parser: class Parser:
...@@ -59,9 +128,13 @@ class Parser: ...@@ -59,9 +128,13 @@ class Parser:
meaning it parses the entire contents of the file. meaning it parses the entire contents of the file.
""" """
root = self._class() root = self._class()
firstbodyline = self._parseheaders(root, fp) fp = TextUtil(fp)
self._parseheaders(root, fp)
if not headersonly: if not headersonly:
self._parsebody(root, fp, firstbodyline) obj = self._parsemessage(root, fp)
trailer = fp.read()
if obj and trailer:
self._attach_trailer(obj, trailer)
return root return root
def parsestr(self, text, headersonly=False): def parsestr(self, text, headersonly=False):
...@@ -80,7 +153,6 @@ class Parser: ...@@ -80,7 +153,6 @@ class Parser:
lastheader = '' lastheader = ''
lastvalue = [] lastvalue = []
lineno = 0 lineno = 0
firstbodyline = None
while True: while True:
# Don't strip the line before we test for the end condition, # Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant # because whitespace-only header lines are RFC compliant
...@@ -129,7 +201,7 @@ class Parser: ...@@ -129,7 +201,7 @@ class Parser:
# There was no separating blank line as mandated by RFC # There was no separating blank line as mandated by RFC
# 2822, but we're in non-strict mode. So just offer up # 2822, but we're in non-strict mode. So just offer up
# this current line as the first body line. # this current line as the first body line.
firstbodyline = line fp.unreadline(line)
break break
if lastheader: if lastheader:
container[lastheader] = NL.join(lastvalue) container[lastheader] = NL.join(lastvalue)
...@@ -138,140 +210,114 @@ class Parser: ...@@ -138,140 +210,114 @@ class Parser:
# Make sure we retain the last header # Make sure we retain the last header
if lastheader: if lastheader:
container[lastheader] = NL.join(lastvalue) container[lastheader] = NL.join(lastvalue)
return firstbodyline return
def _parsebody(self, container, fp, firstbodyline=None): def _parsemessage(self, container, fp):
# Parse the body, but first split the payload on the content-type # Parse the body. We walk through the body from top to bottom,
# boundary if present. # keeping track of the current multipart nesting as we go.
# We return the object that gets the data at the end of this
# block.
boundary = container.get_boundary() boundary = container.get_boundary()
isdigest = (container.get_content_type() == 'multipart/digest') isdigest = (container.get_content_type() == 'multipart/digest')
# If there's a boundary, split the payload text into its constituent if boundary:
# parts and parse each separately. Otherwise, just parse the rest of
# the body as a single message. Note: any exceptions raised in the
# recursive parse need to have their line numbers coerced.
if boundary:
preamble = epilogue = None
# Split into subparts. The first boundary we're looking for won't
# always have a leading newline since we're at the start of the
# body text, and there's not always a preamble before the first
# boundary.
separator = '--' + boundary separator = '--' + boundary
payload = fp.read() boundaryRE = re.compile(
if firstbodyline is not None: r'(?P<sep>' + re.escape(separator) +
payload = firstbodyline + '\n' + payload r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
# We use an RE here because boundaries can have trailing preamble, matchobj = fp.readuntil(boundaryRE)
# whitespace. if not matchobj:
mo = re.search( # Broken - we hit the end of file. Just set the body
r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)', # to the text.
payload) container.set_payload(preamble)
if not mo: return container
if self._strict: if preamble:
raise Errors.BoundaryError( container.preamble = preamble
"Couldn't find starting boundary: %s" % boundary)
container.set_payload(payload)
return
start = mo.start()
if start > 0:
# there's some pre-MIME boundary preamble
preamble = payload[0:start]
# Find out what kind of line endings we're using
start += len(mo.group('sep')) + len(mo.group('ws'))
mo = NLCRE.search(payload, start)
if mo:
start += len(mo.group(0))
# We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't
# support this signature. :(
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
re.escape(separator) + '--')
mo = cre.search(payload, start)
if mo:
terminator = mo.start()
linesep = mo.group('sep')
if mo.end() < len(payload):
# There's some post-MIME boundary epilogue
epilogue = payload[mo.end():]
elif self._strict:
raise Errors.BoundaryError(
"Couldn't find terminating boundary: %s" % boundary)
else: else:
# Handle the case of no trailing boundary. Check that it ends # The module docs specify an empty preamble is None, not ''
# in a blank line. Some cases (spamspamspam) don't even have container.preamble = None
# that! while 1:
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload) subobj = self._class()
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
terminator = len(payload)
# We split the textual payload on the boundary separator, which
# includes the trailing newline. If the container is a
# multipart/digest then the subparts are by default message/rfc822
# instead of text/plain. In that case, they'll have a optional
# block of MIME headers, then an empty line followed by the
# message headers.
parts = re.split(
linesep + re.escape(separator) + r'[ \t]*' + linesep,
payload[start:terminator])
for part in parts:
if isdigest: if isdigest:
if part.startswith(linesep): subobj.set_default_type('message/rfc822')
# There's no header block so create an empty message firstline = fp.peekline()
# object as the container, and lop off the newline so if firstline.strip():
# we can parse the sub-subobject # we have MIME headers. all good.
msgobj = self._class() self._parseheaders(subobj, fp)
part = part[len(linesep):]
else: else:
parthdrs, part = part.split(linesep+linesep, 1) # no MIME headers. this is allowed for multipart/digest
# msgobj in this case is the "message/rfc822" container # Consume the extra blank line
msgobj = self.parsestr(parthdrs, headersonly=1) fp.readline()
# while submsgobj is the message itself pass
msgobj.set_default_type('message/rfc822')
maintype = msgobj.get_content_maintype()
if maintype in ('message', 'multipart'):
submsgobj = self.parsestr(part)
msgobj.attach(submsgobj)
else:
msgobj.set_payload(part)
else: else:
msgobj = self.parsestr(part) self._parseheaders(subobj, fp)
container.preamble = preamble container.attach(subobj)
container.epilogue = epilogue maintype = subobj.get_content_maintype()
container.attach(msgobj) hassubparts = (subobj.get_content_maintype() in
elif container.get_main_type() == 'multipart': ( "message", "multipart" ))
if hassubparts:
subobj = self._parsemessage(subobj, fp)
trailer, matchobj = fp.readuntil(boundaryRE)
if matchobj is None or trailer:
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
trailer = trailer[:-len(linesep)]
if trailer:
self._attach_trailer(subobj, trailer)
if matchobj is None or matchobj.group('end'):
# That was the last piece of data. Let our caller attach
# the epilogue to us. But before we do that, push the
# line ending of the match group back into the readline
# buffer, as it's part of the epilogue.
if matchobj:
fp.unreadline(matchobj.group('linesep'))
return container
elif container.get_content_maintype() == "multipart":
# Very bad. A message is a multipart with no boundary! # Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError( raise Errors.BoundaryError(
'multipart message with no defined boundary') 'multipart message with no defined boundary')
elif container.get_type() == 'message/delivery-status': elif container.get_content_maintype() == "message":
# This special kind of type contains blocks of headers separated ct = container.get_content_type()
# by a blank line. We'll represent each header block as a if ct == "message/rfc822":
# separate Message object submessage = self._class()
blocks = [] self._parseheaders(submessage, fp)
while True: self._parsemessage(submessage, fp)
blockmsg = self._class() container.attach(submessage)
self._parseheaders(blockmsg, fp) return submessage
if not len(blockmsg): elif ct == "message/delivery-status":
# No more header blocks left # This special kind of type contains blocks of headers
break # separated by a blank line. We'll represent each header
blocks.append(blockmsg) # block as a separate Message object
container.set_payload(blocks) while 1:
elif container.get_main_type() == 'message': nextblock = self._class()
# Create a container for the payload, but watch out for there not self._parseheaders(nextblock, fp)
# being any headers left container.attach(nextblock)
try: # next peek ahead to see whether we've hit the end or not
msg = self.parse(fp) nextline = fp.peekline()
except Errors.HeaderParseError: if nextline[:2] == "--":
break
return container
else:
# Other sort of message object (e.g. external-body)
msg = self._class() msg = self._class()
self._parsebody(msg, fp) self._parsemessage(msg, fp)
container.attach(msg) container.attach(msg)
return msg
else: else:
text = fp.read() # single body section. We let our caller set the payload.
if firstbodyline is not None: return container
text = firstbodyline + '\n' + text
container.set_payload(text)
def _attach_trailer(self, obj, trailer):
if obj.get_content_maintype() in ("message", "multipart"):
obj.epilogue = trailer
else:
obj.set_payload(trailer)
class HeaderParser(Parser): class HeaderParser(Parser):
...@@ -284,9 +330,8 @@ class HeaderParser(Parser): ...@@ -284,9 +330,8 @@ class HeaderParser(Parser):
Parsing with this subclass can be considerably faster if all you're Parsing with this subclass can be considerably faster if all you're
interested in is the message headers. interested in is the message headers.
""" """
def _parsebody(self, container, fp, firstbodyline=None): def _parsemessage(self, container, fp):
# Consume but do not parse, the body # Consume but do not parse, the body
text = fp.read() text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text) container.set_payload(text)
return None
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment