Commit 69b7f812 authored by Serhiy Storchaka's avatar Serhiy Storchaka

Issue #15068: Got rid of excessive buffering in the fileinput module.

The bufsize parameter is no longer used.
parent 2d80fb39
......@@ -60,6 +60,9 @@ The following function is the primary interface of this module:
.. versionchanged:: 2.5
Added the *mode* and *openhook* parameters.
.. versionchanged:: 2.7.12
The *bufsize* parameter is no longer used.
The following functions use the global state created by :func:`fileinput.input`;
if there is no active state, :exc:`RuntimeError` is raised.
......@@ -143,6 +146,9 @@ available for subclassing as well:
.. versionchanged:: 2.5
Added the *mode* and *openhook* parameters.
.. versionchanged:: 2.7.12
The *bufsize* parameter is no longer used.
**Optional in-place filtering:** if the keyword argument ``inplace=1`` is passed
to :func:`fileinput.input` or to the :class:`FileInput` constructor, the file is
moved to a backup file and standard output is directed to the input file (if a
......
......@@ -64,13 +64,6 @@ deleted when the output file is closed. In-place filtering is
disabled when standard input is read. XXX The current implementation
does not work for MS-DOS 8+3 filesystems.
Performance: this module is unfortunately one of the slower ways of
processing large numbers of input lines. Nevertheless, a significant
speed-up has been obtained by using readlines(bufsize) instead of
readline(). A new keyword argument, bufsize=N, is present on the
input() function and the FileInput() class to override the default
buffer size.
XXX Possible additions:
- optional getopt argument processing
......@@ -86,6 +79,7 @@ __all__ = ["input","close","nextfile","filename","lineno","filelineno",
_state = None
# No longer used
DEFAULT_BUFSIZE = 8*1024
def input(files=None, inplace=0, backup="", bufsize=0,
......@@ -207,17 +201,15 @@ class FileInput:
self._files = files
self._inplace = inplace
self._backup = backup
self._bufsize = bufsize or DEFAULT_BUFSIZE
self._savestdout = None
self._output = None
self._filename = None
self._lineno = 0
self._startlineno = 0
self._filelineno = 0
self._file = None
self._readline = self._start_readline
self._isstdin = False
self._backupfilename = None
self._buffer = []
self._bufindex = 0
# restrict mode argument to reading modes
if mode not in ('r', 'rU', 'U', 'rb'):
raise ValueError("FileInput opening mode must be one of "
......@@ -242,22 +234,18 @@ class FileInput:
return self
def next(self):
try:
line = self._buffer[self._bufindex]
except IndexError:
pass
else:
self._bufindex += 1
self._lineno += 1
line = self._readline()
if line:
self._filelineno += 1
return line
line = self.readline()
if not line:
if not self._file:
raise StopIteration
return line
self.nextfile()
# Recursive call
return self.next()
def __getitem__(self, i):
if i != self._lineno:
if i != self.lineno():
raise RuntimeError, "accessing lines out of order"
try:
return self.next()
......@@ -277,7 +265,8 @@ class FileInput:
output.close()
finally:
file = self._file
self._file = 0
self._file = None
self._readline = self._start_readline
try:
if file and not self._isstdin:
file.close()
......@@ -289,75 +278,72 @@ class FileInput:
except OSError: pass
self._isstdin = False
self._buffer = []
self._bufindex = 0
def readline(self):
try:
line = self._buffer[self._bufindex]
except IndexError:
pass
while 1:
line = self._readline()
if line:
self._filelineno += 1
return line
if not self._file:
return line
self.nextfile()
# repeat with next file
def _start_readline(self):
if not self._files:
return ""
self._filename = self._files[0]
self._files = self._files[1:]
self._startlineno = self.lineno()
self._filelineno = 0
self._file = None
self._isstdin = False
self._backupfilename = 0
if self._filename == '-':
self._filename = '<stdin>'
self._file = sys.stdin
self._isstdin = True
else:
self._bufindex += 1
self._lineno += 1
self._filelineno += 1
return line
if not self._file:
if not self._files:
return ""
self._filename = self._files[0]
self._files = self._files[1:]
self._filelineno = 0
self._file = None
self._isstdin = False
self._backupfilename = 0
if self._filename == '-':
self._filename = '<stdin>'
self._file = sys.stdin
self._isstdin = True
else:
if self._inplace:
self._backupfilename = (
self._filename + (self._backup or os.extsep+"bak"))
try: os.unlink(self._backupfilename)
except os.error: pass
# The next few lines may raise IOError
os.rename(self._filename, self._backupfilename)
self._file = open(self._backupfilename, self._mode)
if self._inplace:
self._backupfilename = (
self._filename + (self._backup or os.extsep+"bak"))
try: os.unlink(self._backupfilename)
except os.error: pass
# The next few lines may raise IOError
os.rename(self._filename, self._backupfilename)
self._file = open(self._backupfilename, self._mode)
try:
perm = os.fstat(self._file.fileno()).st_mode
except OSError:
self._output = open(self._filename, "w")
else:
fd = os.open(self._filename,
os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
perm)
self._output = os.fdopen(fd, "w")
try:
perm = os.fstat(self._file.fileno()).st_mode
if hasattr(os, 'chmod'):
os.chmod(self._filename, perm)
except OSError:
self._output = open(self._filename, "w")
else:
fd = os.open(self._filename,
os.O_CREAT | os.O_WRONLY | os.O_TRUNC,
perm)
self._output = os.fdopen(fd, "w")
try:
if hasattr(os, 'chmod'):
os.chmod(self._filename, perm)
except OSError:
pass
self._savestdout = sys.stdout
sys.stdout = self._output
pass
self._savestdout = sys.stdout
sys.stdout = self._output
else:
# This may raise IOError
if self._openhook:
self._file = self._openhook(self._filename, self._mode)
else:
# This may raise IOError
if self._openhook:
self._file = self._openhook(self._filename, self._mode)
else:
self._file = open(self._filename, self._mode)
self._buffer = self._file.readlines(self._bufsize)
self._bufindex = 0
if not self._buffer:
self.nextfile()
# Recursive call
return self.readline()
self._file = open(self._filename, self._mode)
self._readline = self._file.readline
return self._readline()
def filename(self):
return self._filename
def lineno(self):
return self._lineno
return self._startlineno + self._filelineno
def filelineno(self):
return self._filelineno
......
......@@ -5,7 +5,7 @@ Nick Mathewson
import unittest
from test.test_support import verbose, TESTFN, run_unittest
from test.test_support import unlink as safe_unlink
from test.test_support import unlink as safe_unlink, check_warnings
import sys, re
from StringIO import StringIO
from fileinput import FileInput, hook_encoded
......@@ -28,6 +28,42 @@ def remove_tempfiles(*names):
for name in names:
safe_unlink(name)
class LineReader:
def __init__(self):
self._linesread = []
@property
def linesread(self):
try:
return self._linesread[:]
finally:
self._linesread = []
def openhook(self, filename, mode):
self.it = iter(filename.splitlines(True))
return self
def readline(self, size=None):
line = next(self.it, '')
self._linesread.append(line)
return line
def readlines(self, hint=-1):
lines = []
size = 0
while True:
line = self.readline()
if not line:
return lines
lines.append(line)
size += len(line)
if size >= hint:
return lines
def close(self):
pass
class BufferSizesTests(unittest.TestCase):
def test_buffer_sizes(self):
# First, run the tests with default and teeny buffer size.
......@@ -228,7 +264,7 @@ class FileInputTests(unittest.TestCase):
f.write('\x80')
self.addCleanup(safe_unlink, TESTFN)
fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8)
fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'))
# The most likely failure is a UnicodeDecodeError due to the entire
# file being read when it shouldn't have been.
self.assertEqual(fi.readline(), u'A\n')
......@@ -239,6 +275,38 @@ class FileInputTests(unittest.TestCase):
list(fi)
fi.close()
def test_readline_buffering(self):
src = LineReader()
fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
self.assertEqual(src.linesread, [])
self.assertEqual(fi.readline(), 'line1\n')
self.assertEqual(src.linesread, ['line1\n'])
self.assertEqual(fi.readline(), 'line2')
self.assertEqual(src.linesread, ['line2'])
self.assertEqual(fi.readline(), 'line3\n')
self.assertEqual(src.linesread, ['', 'line3\n'])
self.assertEqual(fi.readline(), '')
self.assertEqual(src.linesread, [''])
self.assertEqual(fi.readline(), '')
self.assertEqual(src.linesread, [])
fi.close()
def test_iteration_buffering(self):
src = LineReader()
fi = FileInput(files=['line1\nline2', 'line3\n'], openhook=src.openhook)
self.assertEqual(src.linesread, [])
self.assertEqual(next(fi), 'line1\n')
self.assertEqual(src.linesread, ['line1\n'])
self.assertEqual(next(fi), 'line2')
self.assertEqual(src.linesread, ['line2'])
self.assertEqual(next(fi), 'line3\n')
self.assertEqual(src.linesread, ['', 'line3\n'])
self.assertRaises(StopIteration, next, fi)
self.assertEqual(src.linesread, [''])
self.assertRaises(StopIteration, next, fi)
self.assertEqual(src.linesread, [])
fi.close()
class Test_hook_encoded(unittest.TestCase):
"""Unit tests for fileinput.hook_encoded()"""
......
......@@ -58,6 +58,9 @@ Core and Builtins
Library
-------
- Issue #15068: Got rid of excessive buffering in the fileinput module.
The bufsize parameter is no longer used.
- Issue #2202: Fix UnboundLocalError in
AbstractDigestAuthHandler.get_algorithm_impls. Initial patch by Mathieu Dupuy.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment