Merge in Anthony's new parser code, from the anthony-parser-branch:

> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17; author: anthonybaxter; state: Exp; lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40; author: anthonybaxter; state: Exp; lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09; author: anthonybaxter; state: Exp; lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56; author: anthonybaxter; state: Exp; lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================

Merge in Anthony's new parser code, from the anthony-parser-branch:
> ----------------------------
> revision 1.20.4.4
> date: 2003/06/12 09:14:17; author: anthonybaxter; state: Exp; lines: +13 -6
> preamble is None when missing, not ''.
> Handle a couple of bogus formatted messages - now parses my main testsuite.
> Handle message/external-body.
> ----------------------------
> revision 1.20.4.3
> date: 2003/06/12 07:16:40; author: anthonybaxter; state: Exp; lines: +6 -4
> epilogue-processing is now the same as the old parser - the newline at the
> end of the line with the --endboundary-- is included as part of the epilogue.
> Note that any whitespace after the boundary is _not_ part of the epilogue.
> ----------------------------
> revision 1.20.4.2
> date: 2003/06/12 06:39:09; author: anthonybaxter; state: Exp; lines: +6 -4
> message/delivery-status fixed.
> HeaderParser fixed.
> ----------------------------
> revision 1.20.4.1
> date: 2003/06/12 06:08:56; author: anthonybaxter; state: Exp; lines: +163 -129
> A work-in-progress snapshot of the new parser. A couple of known problems:
>
> - first (blank) line of MIME epilogues is being consumed
> - message/delivery-status isn't quite right
>
> It still needs a lot of cleanup, but right now it parses a whole lot of
> badness that the old parser failed on. I also need to think about adding
> back the old 'strict' flag in some way.
> =============================================================================
0813d76c · Thomas Wouters · d4079e1f · 0813d76c
Commit 0813d76c authored Mar 20, 2004 by Thomas Wouters
Hide whitespace changes
Inline Side-by-side

Showing with 173 additions and 128 deletions

Lib/email/Parser.py Lib/email/Parser.py +173 -128

No files found.
--- a/Lib/email/Parser.py
+++ b/Lib/email/Parser.py
@@ -22,6 +22,75 @@ except NameError:
 NLCRE = re.compile('\r\n|\r|\n')
+class TextUtil:
+    """ A utility class for wrapping a file object and providing a 
+        couple of additional useful functions.
+    """
+    def __init__(self, fp):
+        self.fp = fp
+        self.unread = []
+    def readline(self):
+        """ Return a line of data.
+        If data has been pushed back with unreadline(), the most recently
+        returned unreadline()d data will be returned.
+        """
+        if self.unread:
+            return self.unread.pop()
+        else:
+            return self.fp.readline()
+    def unreadline(self, line):
+        """Push a line back into the object. 
+        """
+        self.unread.append(line)
+    def peekline(self):
+        """Non-destructively look at the next line"""
+        line = self.readline()
+        self.unreadline(line)
+        return line
+    def read(self):
+        """Return the remaining data
+        """
+        r = self.fp.read()
+        if self.unread:
+            r = "\n".join(self.unread) + r
+            self.unread = []
+        return r
+    def readuntil(self, re, afterblank=0, includematch=0):
+        """Read a line at a time until we get the specified RE. 
+        Returns the text up to (and including, if includematch is true) the 
+        matched text, and the RE match object. If afterblank is true, 
+        there must be a blank line before the matched text. Moves current 
+        filepointer to the line following the matched line. If we reach 
+        end-of-file, return what we've got so far, and return None as the
+        RE match object.
+        """
+        prematch = []
+        blankseen = 0
+        while 1:
+            line = self.readline()
+            if not line:
+                # end of file
+                return EMPTYSTRING.join(prematch), None
+            if afterblank:
+                if NLCRE.match(line):
+                    blankseen = 1
+                    continue
+                else:
+                    blankseen = 0
+            m = re.match(line)
+            if (m and not afterblank) or (m and afterblank and blankseen):
+                if includematch:
+                    prematch.append(line)
+                return EMPTYSTRING.join(prematch), m
+            prematch.append(line)
 class Parser:
@@ -59,9 +128,13 @@ class Parser:
        meaning it parses the entire contents of the file.
        """
        root = self._class()
-        firstbodyline = self._parseheaders(root, fp)
+        fp = TextUtil(fp)
+        self._parseheaders(root, fp)
        if not headersonly:
-            self._parsebody(root, fp, firstbodyline)
+            obj = self._parsemessage(root, fp)
+            trailer = fp.read()
+            if obj and trailer:
+                self._attach_trailer(obj, trailer)
        return root
    def parsestr(self, text, headersonly=False):
@@ -80,7 +153,6 @@ class Parser:
        lastheader = ''
        lastvalue = []
        lineno = 0
-        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
@@ -129,7 +201,7 @@ class Parser:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
-                    firstbodyline = line
+                    fp.unreadline(line)
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
@@ -138,140 +210,114 @@ class Parser:
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
-        return firstbodyline
+        return 
-    def _parsebody(self, container, fp, firstbodyline=None):
+    def _parsemessage(self, container, fp):
-        # Parse the body, but first split the payload on the content-type
+        # Parse the body. We walk through the body from top to bottom,
-        # boundary if present.
+        # keeping track of the current multipart nesting as we go.
+        # We return the object that gets the data at the end of this 
+        # block.
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
-        # If there's a boundary, split the payload text into its constituent
+        if boundary: 
-        # parts and parse each separately.  Otherwise, just parse the rest of
-        # the body as a single message.  Note: any exceptions raised in the
-        # recursive parse need to have their line numbers coerced.
-        if boundary:
-            preamble = epilogue = None
-            # Split into subparts.  The first boundary we're looking for won't
-            # always have a leading newline since we're at the start of the
-            # body text, and there's not always a preamble before the first
-            # boundary.
            separator = '--' + boundary
-            payload = fp.read()
+            boundaryRE = re.compile(
-            if firstbodyline is not None:
+                    r'(?P<sep>' + re.escape(separator) + 
-                payload = firstbodyline + '\n' + payload
+                    r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
-            # We use an RE here because boundaries can have trailing
+            preamble, matchobj = fp.readuntil(boundaryRE)
-            # whitespace.
+            if not matchobj:
-            mo = re.search(
+                # Broken - we hit the end of file. Just set the body 
-                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
+                # to the text.
-                payload)
+                container.set_payload(preamble)
-            if not mo:
+                return container
-                if self._strict:
+            if preamble:
-                    raise Errors.BoundaryError(
+                container.preamble = preamble
-                        "Couldn't find starting boundary: %s" % boundary)
-                container.set_payload(payload)
-                return
-            start = mo.start()
-            if start > 0:
-                # there's some pre-MIME boundary preamble
-                preamble = payload[0:start]
-            # Find out what kind of line endings we're using
-            start += len(mo.group('sep')) + len(mo.group('ws'))
-            mo = NLCRE.search(payload, start)
-            if mo:
-                start += len(mo.group(0))
-            # We create a compiled regexp first because we need to be able to
-            # specify the start position, and the module function doesn't
-            # support this signature. :(
-            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
-                             re.escape(separator) + '--')
-            mo = cre.search(payload, start)
-            if mo:
-                terminator = mo.start()
-                linesep = mo.group('sep')
-                if mo.end() < len(payload):
-                    # There's some post-MIME boundary epilogue
-                    epilogue = payload[mo.end():]
-            elif self._strict:
-                raise Errors.BoundaryError(
-                        "Couldn't find terminating boundary: %s" % boundary)
            else:
-                # Handle the case of no trailing boundary.  Check that it ends
+                # The module docs specify an empty preamble is None, not ''
-                # in a blank line.  Some cases (spamspamspam) don't even have
+                container.preamble = None
-                # that!
+            while 1:
-                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
+                subobj = self._class()
-                if not mo:
-                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
-                    if not mo:
-                        raise Errors.BoundaryError(
-                          'No terminating boundary and no trailing empty line')
-                linesep = mo.group('sep')
-                terminator = len(payload)
-            # We split the textual payload on the boundary separator, which
-            # includes the trailing newline. If the container is a
-            # multipart/digest then the subparts are by default message/rfc822
-            # instead of text/plain.  In that case, they'll have a optional
-            # block of MIME headers, then an empty line followed by the
-            # message headers.
-            parts = re.split(
-                linesep + re.escape(separator) + r'[ \t]*' + linesep,
-                payload[start:terminator])
-            for part in parts:
                if isdigest:
-                    if part.startswith(linesep):
+                    subobj.set_default_type('message/rfc822')
-                        # There's no header block so create an empty message
+                    firstline = fp.peekline()
-                        # object as the container, and lop off the newline so
+                    if firstline.strip():
-                        # we can parse the sub-subobject
+                        # we have MIME headers. all good. 
-                        msgobj = self._class()
+                        self._parseheaders(subobj, fp)
-                        part = part[len(linesep):]
                    else:
-                        parthdrs, part = part.split(linesep+linesep, 1)
+                        # no MIME headers. this is allowed for multipart/digest
-                        # msgobj in this case is the "message/rfc822" container
+                        # Consume the extra blank line
-                        msgobj = self.parsestr(parthdrs, headersonly=1)
+                        fp.readline()
-                    # while submsgobj is the message itself
+                        pass
-                    msgobj.set_default_type('message/rfc822')
-                    maintype = msgobj.get_content_maintype()
-                    if maintype in ('message', 'multipart'):
-                        submsgobj = self.parsestr(part)
-                        msgobj.attach(submsgobj)
-                    else:
-                        msgobj.set_payload(part)
                else:
-                    msgobj = self.parsestr(part)
+                    self._parseheaders(subobj, fp)
-                container.preamble = preamble
+                container.attach(subobj)
-                container.epilogue = epilogue
+                maintype = subobj.get_content_maintype()
-                container.attach(msgobj)
+                hassubparts = (subobj.get_content_maintype() in 
-        elif container.get_main_type() == 'multipart':
+                                                ( "message", "multipart" ))
+                if hassubparts:
+                    subobj = self._parsemessage(subobj, fp)
+                trailer, matchobj = fp.readuntil(boundaryRE)
+                if matchobj is None or trailer:
+                    mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
+                    if not mo:
+                        mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
+                        if not mo:
+                            raise Errors.BoundaryError(
+                          'No terminating boundary and no trailing empty line')
+                    linesep = mo.group('sep')
+                    trailer = trailer[:-len(linesep)]
+                if trailer:
+                    self._attach_trailer(subobj, trailer)
+                if matchobj is None or matchobj.group('end'):
+                    # That was the last piece of data. Let our caller attach
+                    # the epilogue to us. But before we do that, push the
+                    # line ending of the match group back into the readline
+                    # buffer, as it's part of the epilogue.
+                    if matchobj:
+                        fp.unreadline(matchobj.group('linesep'))
+                    return container
+        elif container.get_content_maintype() == "multipart":
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
-                'multipart message with no defined boundary')
+                    'multipart message with no defined boundary')
-        elif container.get_type() == 'message/delivery-status':
+        elif container.get_content_maintype() == "message":
-            # This special kind of type contains blocks of headers separated
+            ct = container.get_content_type()
-            # by a blank line.  We'll represent each header block as a
+            if ct == "message/rfc822":
-            # separate Message object
+                submessage = self._class()
-            blocks = []
+                self._parseheaders(submessage, fp)
-            while True:
+                self._parsemessage(submessage, fp)
-                blockmsg = self._class()
+                container.attach(submessage)
-                self._parseheaders(blockmsg, fp)
+                return submessage
-                if not len(blockmsg):
+            elif ct == "message/delivery-status":
-                    # No more header blocks left
+                # This special kind of type contains blocks of headers 
-                    break
+                # separated by a blank line.  We'll represent each header 
-                blocks.append(blockmsg)
+                # block as a separate Message object
-            container.set_payload(blocks)
+                while 1:
-        elif container.get_main_type() == 'message':
+                    nextblock = self._class()
-            # Create a container for the payload, but watch out for there not
+                    self._parseheaders(nextblock, fp)
-            # being any headers left
+                    container.attach(nextblock)
-            try:
+                    # next peek ahead to see whether we've hit the end or not
-                msg = self.parse(fp)
+                    nextline = fp.peekline()
-            except Errors.HeaderParseError:
+                    if nextline[:2] == "--":
+                        break
+                return container
+            else:
+                # Other sort of message object (e.g. external-body)
                msg = self._class()
-                self._parsebody(msg, fp)
+                self._parsemessage(msg, fp)
-            container.attach(msg)
+                container.attach(msg)
+                return msg
        else:
-            text = fp.read()
+            # single body section. We let our caller set the payload.
-            if firstbodyline is not None:
+            return container
-                text = firstbodyline + '\n' + text
-            container.set_payload(text)
+    def _attach_trailer(self, obj, trailer):
+        if obj.get_content_maintype() in ("message", "multipart"):
+            obj.epilogue = trailer
+        else:
+            obj.set_payload(trailer)
 class HeaderParser(Parser):
@@ -284,9 +330,8 @@ class HeaderParser(Parser):
    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """
-    def _parsebody(self, container, fp, firstbodyline=None):
+    def _parsemessage(self, container, fp):
        # Consume but do not parse, the body
        text = fp.read()
-        if firstbodyline is not None:
-            text = firstbodyline + '\n' + text
        container.set_payload(text)
+        return None