Commit 320a1c0f authored by Serhiy Storchaka

Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing long lines.

Original patch by Raymond Hettinger.
parent 6f201707
...@@ -50,8 +50,8 @@ class BufferedSubFile(object): ...@@ -50,8 +50,8 @@ class BufferedSubFile(object):
simple abstraction -- it parses until EOF closes the current message. simple abstraction -- it parses until EOF closes the current message.
""" """
def __init__(self): def __init__(self):
# The last partial line pushed into this object. # Chunks of the last partial line pushed into this object.
self._partial = '' self._partial = []
# The list of full, pushed lines, in reverse order # The list of full, pushed lines, in reverse order
self._lines = [] self._lines = []
# The stack of false-EOF checking predicates. # The stack of false-EOF checking predicates.
...@@ -67,8 +67,8 @@ class BufferedSubFile(object): ...@@ -67,8 +67,8 @@ class BufferedSubFile(object):
def close(self): def close(self):
# Don't forget any trailing partial line. # Don't forget any trailing partial line.
self._lines.append(self._partial) self.pushlines(''.join(self._partial).splitlines(True))
self._partial = '' self._partial = []
self._closed = True self._closed = True
def readline(self): def readline(self):
...@@ -96,16 +96,26 @@ class BufferedSubFile(object): ...@@ -96,16 +96,26 @@ class BufferedSubFile(object):
def push(self, data): def push(self, data):
"""Push some new data into this object.""" """Push some new data into this object."""
# Handle any previous leftovers
data, self._partial = self._partial + data, ''
# Crack into lines, but preserve the linesep characters on the end of each # Crack into lines, but preserve the linesep characters on the end of each
parts = data.splitlines(True) parts = data.splitlines(True)
if not parts or not parts[0].endswith(('\n', '\r')):
# No new complete lines, so just accumulate partials
self._partial += parts
return
if self._partial:
# If there are previous leftovers, complete them now
self._partial.append(parts[0])
parts[0:1] = ''.join(self._partial).splitlines(True)
del self._partial[:]
# If the last element of the list does not end in a newline, then treat # If the last element of the list does not end in a newline, then treat
# it as a partial line. We only check for '\n' here because a line # it as a partial line. We only check for '\n' here because a line
# ending with '\r' might be a line that was split in the middle of a # ending with '\r' might be a line that was split in the middle of a
# '\r\n' sequence (see bugs 1555570 and 1721862). # '\r\n' sequence (see bugs 1555570 and 1721862).
if parts and not parts[-1].endswith('\n'): if not parts[-1].endswith('\n'):
self._partial = parts.pop() self._partial = [parts.pop()]
self.pushlines(parts) self.pushlines(parts)
def pushlines(self, lines): def pushlines(self, lines):
......
...@@ -10,6 +10,7 @@ import textwrap ...@@ -10,6 +10,7 @@ import textwrap
from io import StringIO, BytesIO from io import StringIO, BytesIO
from itertools import chain from itertools import chain
from random import choice
import email import email
import email.policy import email.policy
...@@ -3353,16 +3354,70 @@ Do you like this message? ...@@ -3353,16 +3354,70 @@ Do you like this message?
bsf.push(il) bsf.push(il)
nt += n nt += n
n1 = 0 n1 = 0
while True: for ol in iter(bsf.readline, NeedMoreData):
ol = bsf.readline()
if ol == NeedMoreData:
break
om.append(ol) om.append(ol)
n1 += 1 n1 += 1
self.assertEqual(n, n1) self.assertEqual(n, n1)
self.assertEqual(len(om), nt) self.assertEqual(len(om), nt)
self.assertEqual(''.join([il for il, n in imt]), ''.join(om)) self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
def test_push_random(self):
from email.feedparser import BufferedSubFile, NeedMoreData
n = 10000
chunksize = 5
chars = 'abcd \t\r\n'
s = ''.join(choice(chars) for i in range(n)) + '\n'
target = s.splitlines(True)
bsf = BufferedSubFile()
lines = []
for i in range(0, len(s), chunksize):
chunk = s[i:i+chunksize]
bsf.push(chunk)
lines.extend(iter(bsf.readline, NeedMoreData))
self.assertEqual(lines, target)
class TestFeedParsers(TestEmailBase):
    # Checks of FeedParser line handling when the input is supplied as a
    # sequence of separate chunks.

    def parse(self, chunks):
        """Feed every item of *chunks* to a new FeedParser and return the
        object produced by its close() method."""
        from email.feedparser import FeedParser
        parser = FeedParser()
        for piece in chunks:
            parser.feed(piece)
        return parser.close()

    def test_newlines(self):
        # '\n', '\r' and '\r\n' all terminate a header line.
        msg = self.parse(['a:\nb:\rc:\r\nd:\n'])
        self.assertEqual(msg.keys(), ['a', 'b', 'c', 'd'])
        # Same input, but the last header has no line ending at all.
        msg = self.parse(['a:\nb:\rc:\r\nd:'])
        self.assertEqual(msg.keys(), ['a', 'b', 'c', 'd'])
        # A header name split across two chunks.
        msg = self.parse(['a:\rb', 'c:\n'])
        self.assertEqual(msg.keys(), ['a', 'bc'])
        # A chunk ending in '\r', followed by a chunk starting a new header.
        msg = self.parse(['a:\r', 'b:\n'])
        self.assertEqual(msg.keys(), ['a', 'b'])
        # A '\r\n' pair split across two chunks.
        msg = self.parse(['a:\r', '\nb:\n'])
        self.assertEqual(msg.keys(), ['a', 'b'])
        # '\x85' and '\u2028' are expected to show up in the header values.
        msg = self.parse(['a:\x85b:\u2028c:\n'])
        self.assertEqual(msg.items(),
                         [('a', '\x85'), ('b', '\u2028'), ('c', '')])
        msg = self.parse(['a:\r', 'b:\x85', 'c:\n'])
        self.assertEqual(msg.items(), [('a', ''), ('b', '\x85'), ('c', '')])

    def test_long_lines(self):
        # Each case feeds `count` chunks of `size` characters that contain
        # no '\n' or '\r', i.e. 10**8 characters in total.
        size, count = 1000, 100000
        filler = 'x'*size

        # Payload after a '\n\n' header/body separator.
        msg = self.parse(['a:b\n\n'] + [filler] * count)
        self.assertEqual(msg.items(), [('a', 'b')])
        self.assertEqual(msg.get_payload(), 'x'*size*count)

        # Payload after a '\r\r' header/body separator.
        msg = self.parse(['a:b\r\r'] + [filler] * count)
        self.assertEqual(msg.items(), [('a', 'b')])
        self.assertEqual(msg.get_payload(), 'x'*size*count)

        # Every chunk ends with '\x85'; the payload is expected to keep them.
        msg = self.parse(['a:b\r\r'] + [filler+'\x85'] * count)
        self.assertEqual(msg.items(), [('a', 'b')])
        self.assertEqual(msg.get_payload(), (filler+'\x85')*count)

        # A header value that spans all of the chunks.
        msg = self.parse(['a:\r', 'b: '] + [filler] * count)
        self.assertEqual(msg.items(), [('a', ''), ('b', 'x'*size*count)])
class TestParsers(TestEmailBase): class TestParsers(TestEmailBase):
......
...@@ -27,6 +27,9 @@ Core and Builtins ...@@ -27,6 +27,9 @@ Core and Builtins
Library Library
------- -------
- Issue #21448: Changed FeedParser feed() to avoid O(N**2) behavior when
parsing long lines. Original patch by Raymond Hettinger.
- Issue #17923: glob() patterns ending with a slash no longer match non-dirs on - Issue #17923: glob() patterns ending with a slash no longer match non-dirs on
AIX. Based on patch by Delhallt. AIX. Based on patch by Delhallt.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment