Commit 9b76da6a authored by Guido van Rossum

Checkpoint so I can continue to work on this at a different box.

There is somewhat working (but slow) code supporting seek/tell for text files,
but extensive testing exposes a bug I can't nail down.
parent 8742977b
...@@ -13,8 +13,9 @@ variable are part of the specification. ...@@ -13,8 +13,9 @@ variable are part of the specification.
XXX need to default buffer size to 1 if isatty() XXX need to default buffer size to 1 if isatty()
XXX need to support 1 meaning line-buffered XXX need to support 1 meaning line-buffered
XXX change behavior of blocking I/O
XXX don't use assert to validate input requirements XXX don't use assert to validate input requirements
XXX whenever an argument is None, use the default value
XXX read/write ops should check readable/writable
""" """
__author__ = ("Guido van Rossum <guido@python.org>, " __author__ = ("Guido van Rossum <guido@python.org>, "
...@@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO", ...@@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO",
import os import os
import sys import sys
import codecs import codecs
import pickle
import _fileio import _fileio
import warnings import warnings
# XXX Shouldn't we use st_blksize whenever we can?
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
...@@ -44,18 +47,22 @@ class BlockingIOError(IOError): ...@@ -44,18 +47,22 @@ class BlockingIOError(IOError):
self.characters_written = characters_written self.characters_written = characters_written
def open(file, mode="r", buffering=None, *, encoding=None): def open(file, mode="r", buffering=None, *, encoding=None, newline=None):
"""Replacement for the built-in open function. """Replacement for the built-in open function.
Args: Args:
file: string giving the name of the file to be opened; file: string giving the name of the file to be opened;
or integer file descriptor of the file to be wrapped (*) or integer file descriptor of the file to be wrapped (*).
mode: optional mode string; see below mode: optional mode string; see below.
buffering: optional int >= 0 giving the buffer size; values buffering: optional int >= 0 giving the buffer size; values
can be: 0 = unbuffered, 1 = line buffered, can be: 0 = unbuffered, 1 = line buffered,
larger = fully buffered larger = fully buffered.
encoding: optional string giving the text encoding (*must* be given Keywords (for text modes only; *must* be given as keyword arguments):
as a keyword argument) encoding: optional string giving the text encoding.
newline: optional newlines specifier; must be None, '\n' or '\r\n';
specifies the line ending expected on input and written on
output. If None, use universal newlines on input and
use os.linesep on output.
(*) If a file descriptor is given, it is closed when the returned (*) If a file descriptor is given, it is closed when the returned
I/O object is closed. If you don't want this to happen, use I/O object is closed. If you don't want this to happen, use
...@@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None): ...@@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
binary stream, a buffered binary stream, or a buffered text binary stream, a buffered binary stream, or a buffered text
stream, open for reading and/or writing. stream, open for reading and/or writing.
""" """
# XXX Don't use asserts for these checks; raise TypeError or ValueError
assert isinstance(file, (basestring, int)), repr(file) assert isinstance(file, (basestring, int)), repr(file)
assert isinstance(mode, basestring), repr(mode) assert isinstance(mode, basestring), repr(mode)
assert buffering is None or isinstance(buffering, int), repr(buffering) assert buffering is None or isinstance(buffering, int), repr(buffering)
...@@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None): ...@@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None):
if not (reading or writing or appending): if not (reading or writing or appending):
raise ValueError("must have exactly one of read/write/append mode") raise ValueError("must have exactly one of read/write/append mode")
if binary and encoding is not None: if binary and encoding is not None:
raise ValueError("binary mode doesn't take an encoding") raise ValueError("binary mode doesn't take an encoding argument")
if binary and newline is not None:
raise ValueError("binary mode doesn't take a newline argument")
raw = FileIO(file, raw = FileIO(file,
(reading and "r" or "") + (reading and "r" or "") +
(writing and "w" or "") + (writing and "w" or "") +
...@@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None): ...@@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
buffer = BufferedReader(raw, buffering) buffer = BufferedReader(raw, buffering)
if binary: if binary:
return buffer return buffer
# XXX What about newline conventions? return TextIOWrapper(buffer, encoding, newline)
textio = TextIOWrapper(buffer, encoding)
return textio
class IOBase: class IOBase:
...@@ -795,6 +803,8 @@ class TextIOBase(IOBase): ...@@ -795,6 +803,8 @@ class TextIOBase(IOBase):
"""Base class for text I/O. """Base class for text I/O.
This class provides a character and line based interface to stream I/O. This class provides a character and line based interface to stream I/O.
There is no readinto() method, as character strings are immutable.
""" """
def read(self, n: int = -1) -> str: def read(self, n: int = -1) -> str:
...@@ -805,10 +815,18 @@ class TextIOBase(IOBase): ...@@ -805,10 +815,18 @@ class TextIOBase(IOBase):
""" """
self._unsupported("read") self._unsupported("read")
def write(self, s: str): def write(self, s: str) -> int:
"""write(s: str) -> None. Write string s to stream.""" """write(s: str) -> int. Write string s to stream."""
self._unsupported("write") self._unsupported("write")
def truncate(self, pos: int = None) -> int:
    """truncate(pos: int = None) -> int. Truncate size to pos.

    Pending output is flushed first.  When pos is omitted, the position
    reported by tell() is used.  The stream is then positioned at pos and
    the underlying buffer is truncated; whatever the buffer's truncate()
    returns is handed back to the caller.
    """
    self.flush()
    cut_at = self.tell() if pos is None else pos
    self.seek(cut_at)
    return self.buffer.truncate()
def readline(self) -> str: def readline(self) -> str:
"""readline() -> str. Read until newline or EOF. """readline() -> str. Read until newline or EOF.
...@@ -816,12 +834,12 @@ class TextIOBase(IOBase): ...@@ -816,12 +834,12 @@ class TextIOBase(IOBase):
""" """
self._unsupported("readline") self._unsupported("readline")
def __iter__(self): def __iter__(self) -> "TextIOBase": # That's a forward reference
"""__iter__() -> Iterator. Return line iterator (actually just self). """__iter__() -> Iterator. Return line iterator (actually just self).
""" """
return self return self
def next(self): def next(self) -> str:
"""Same as readline() except raises StopIteration on immediate EOF.""" """Same as readline() except raises StopIteration on immediate EOF."""
line = self.readline() line = self.readline()
if not line: if not line:
...@@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase): ...@@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase):
Character and line based layer over a BufferedIOBase object. Character and line based layer over a BufferedIOBase object.
""" """
# XXX tell(), seek() _CHUNK_SIZE = 64
def __init__(self, buffer, encoding=None, newline=None): def __init__(self, buffer, encoding=None, newline=None):
if newline not in (None, '\n', '\r\n'): if newline not in (None, "\n", "\r\n"):
raise IOError("illegal newline %s" % newline) # XXX: ValueError? raise ValueError("illegal newline value: %r" % (newline,))
if encoding is None: if encoding is None:
# XXX This is questionable # XXX This is questionable
encoding = sys.getfilesystemencoding() or "latin-1" encoding = sys.getfilesystemencoding() or "latin-1"
...@@ -869,7 +887,20 @@ class TextIOWrapper(TextIOBase): ...@@ -869,7 +887,20 @@ class TextIOWrapper(TextIOBase):
self._newline = newline or os.linesep self._newline = newline or os.linesep
self._fix_newlines = newline is None self._fix_newlines = newline is None
self._decoder = None self._decoder = None
self._pending = '' self._decoder_in_rest_pickle = None
self._pending = ""
self._snapshot = None
self._seekable = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a
# tuple (position, decoder_pickle, readahead) where position is a
# position of the underlying buffer, decoder_pickle is a pickled
# decoder state, and readahead is the chunk of bytes that was read
# from that position. We use this to reconstruct intermediate
# decoder states in tell().
def seekable(self):
    """seekable() -> bool.  Return whether the wrapped buffer is seekable.

    The answer is the value stored in self._seekable.

    This accessor used to be spelled _seekable(), i.e. exactly the name of
    the instance attribute it returns.  On any instance that has the
    attribute set, the attribute hides a method of the same name, so the
    method could never be called; it is therefore exposed under the public
    name instead.  Reading obj._seekable still yields the stored bool.
    """
    return self._seekable
def flush(self): def flush(self):
self.buffer.flush() self.buffer.flush()
...@@ -886,35 +917,124 @@ class TextIOWrapper(TextIOBase): ...@@ -886,35 +917,124 @@ class TextIOWrapper(TextIOBase):
return self.buffer.fileno() return self.buffer.fileno()
def write(self, s: str): def write(self, s: str):
# XXX What if we were just reading?
b = s.encode(self._encoding) b = s.encode(self._encoding)
if isinstance(b, str): if isinstance(b, str):
b = bytes(b) b = bytes(b)
n = self.buffer.write(b) n = self.buffer.write(b)
if "\n" in s: if "\n" in s:
self.flush() self.flush()
return n self._snapshot = self._decoder = None
return len(s)
def _get_decoder(self): def _get_decoder(self):
make_decoder = codecs.getincrementaldecoder(self._encoding) make_decoder = codecs.getincrementaldecoder(self._encoding)
if make_decoder is None: if make_decoder is None:
raise IOError(".readline() not supported for encoding %s" % raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding) self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors decoder = self._decoder = make_decoder() # XXX: errors
if isinstance(decoder, codecs.BufferedIncrementalDecoder): if isinstance(decoder, codecs.BufferedIncrementalDecoder):
# XXX Hack: make the codec use bytes instead of strings # XXX Hack: make the codec use bytes instead of strings
decoder.buffer = b"" decoder.buffer = b""
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
return decoder return decoder
def _read_chunk(self):
if not self._seekable:
return self.buffer.read(self._CHUNK_SIZE)
assert self._decoder is not None
position = self.buffer.tell()
decoder_state = pickle.dumps(self._decoder, 2)
readahead = self.buffer.read(self._CHUNK_SIZE)
self._snapshot = (position, decoder_state, readahead)
return readahead
def _encode_decoder_state(self, ds, pos):
if ds == self._decoder_in_rest_pickle:
return pos
x = 0
for i in bytes(ds):
x = x<<8 | i
return (x<<64) | pos
def _decode_decoder_state(self, pos):
    """Split a seek cookie into (decoder state, buffer position).

    The low 64 bits of pos are the position.  Anything above them is the
    decoder state packed as a big-endian integer.  Returns (None, position)
    when nothing is packed above the low 64 bits.
    """
    x, pos = divmod(pos, 1<<64)
    if not x:
        return None, pos
    # NOTE(review): b.append() needs a mutable bytes object; this presumably
    # targets an interpreter whose b"" literal is mutable -- confirm before
    # running it elsewhere.
    b = b""
    while x:
        # Peel off the least significant byte first ...
        b.append(x&0xff)
        x >>= 8
    # ... so the collected bytes are reversed to restore the original order.
    return str(b[::-1]), pos
def tell(self):
    """tell() -> int.  Return a cookie describing the current text position.

    When no decoder or no snapshot is active, the cookie is the position
    of the underlying buffer.  Otherwise the read-ahead bytes recorded in
    self._snapshot are decoded again one byte at a time to find how many
    of them account for the characters already handed out (everything
    decoded from the read-ahead except self._pending); the decoder state
    at that point is folded into the cookie by _encode_decoder_state().

    Raises:
        IOError: if the underlying stream is not seekable, or if no byte
            offset in the read-ahead matches the pending characters.
    """
    if not self._seekable:
        raise IOError("Underlying stream is not seekable")
    self.flush()
    if self._decoder is None or self._snapshot is None:
        # No read-ahead to account for.
        assert self._pending == ""
        return self.buffer.tell()
    position, decoder_state, readahead = self._snapshot
    decoder = pickle.loads(decoder_state)
    characters = ""
    # Candidate positions as (characters decoded so far, bytes consumed,
    # pickled decoder state).  The first candidate is "nothing consumed
    # yet": without it, a tell() issued while every character decoded from
    # the read-ahead is still pending (or while the read-ahead is empty,
    # as right after a seek to a stateful cookie) could not be resolved.
    sequence = [(characters, 0, decoder_state)]
    for i, b in enumerate(readahead):
        c = decoder.decode(bytes([b]))
        if c:
            characters += c
            sequence.append((characters, i+1, pickle.dumps(decoder, 2)))
    for ch, i, st in sequence:
        if ch + self._pending == characters:
            return self._encode_decoder_state(st, position + i)
    raise IOError("Can't reconstruct logical file position")
def seek(self, pos, whence=0):
    """seek(pos: int, whence: int = 0) -> int.  Reposition the text stream.

    Args:
        pos: for whence 0, a non-negative cookie as returned by tell();
            for whence 1 and 2 only 0 is accepted.
        whence: 0 = absolute cookie, 1 = current position (equivalent to
            tell()), 2 = end of the stream.

    Returns:
        The new position: the result of tell() for whence 1, the buffer's
        end position for whence 2, and for whence 0 either the plain
        buffer position or, for a cookie that carries decoder state, the
        cookie itself.

    Raises:
        IOError: if the underlying stream is not seekable, or for a
            nonzero pos with whence 1 or 2.
        ValueError: for an unknown whence or a negative pos.
    """
    if not self._seekable:
        raise IOError("Underlying stream is not seekable")
    if whence == 1:
        if pos != 0:
            raise IOError("Can't do nonzero cur-relative seeks")
        return self.tell()
    if whence == 2:
        if pos != 0:
            raise IOError("Can't do nonzero end-relative seeks")
        self.flush()
        pos = self.buffer.seek(0, 2)
        self._snapshot = None
        self._pending = ""
        self._decoder = None
        return pos
    if whence != 0:
        raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
                         (whence,))
    if pos < 0:
        raise ValueError("Negative seek position %r" % (pos,))
    orig_pos = pos
    ds, pos = self._decode_decoder_state(pos)
    if not ds:
        # Plain position: decoding restarts from scratch at pos.
        self.buffer.seek(pos)
        self._snapshot = None
        self._pending = ""
        self._decoder = None
        return pos
    # NOTE: the state is unpickled straight out of the cookie.  Only pass
    # cookies obtained from tell(); unpickling data from an untrusted
    # source can execute arbitrary code.
    decoder = pickle.loads(ds)
    self.buffer.seek(pos)
    self._snapshot = (pos, ds, "")
    self._pending = ""
    # Install the restored decoder.  Dropping it here would make the next
    # read start with a fresh decoder and lose the state that the cookie
    # was carrying.
    self._decoder = decoder
    return orig_pos
def read(self, n: int = -1): def read(self, n: int = -1):
decoder = self._decoder or self._get_decoder() decoder = self._decoder or self._get_decoder()
res = self._pending res = self._pending
if n < 0: if n < 0:
res += decoder.decode(self.buffer.read(), True) res += decoder.decode(self.buffer.read(), True)
self._pending = "" self._pending = ""
self._snapshot = None
return res return res
else: else:
while len(res) < n: while len(res) < n:
data = self.buffer.read(64) data = self._read_chunk()
res += decoder.decode(data, not data) res += decoder.decode(data, not data)
if not data: if not data:
break break
...@@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase): ...@@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase):
def readline(self, limit=None): def readline(self, limit=None):
if limit is not None: if limit is not None:
# XXX Hack to support limit arg # XXX Hack to support limit argument, for backwards compatibility
line = self.readline() line = self.readline()
if len(line) <= limit: if len(line) <= limit:
return line return line
...@@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase): ...@@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase):
# We've seen \r - is it standalone, \r\n or \r at end of line? # We've seen \r - is it standalone, \r\n or \r at end of line?
if endpos + 1 < len(line): if endpos + 1 < len(line):
if line[endpos+1] == '\n': if line[endpos+1] == "\n":
ending = "\r\n" ending = "\r\n"
else: else:
ending = "\r" ending = "\r"
...@@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase): ...@@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase):
# No line ending seen yet - get more data # No line ending seen yet - get more data
while True: while True:
data = self.buffer.read(64) data = self._read_chunk()
more_line = decoder.decode(data, not data) more_line = decoder.decode(data, not data)
if more_line or not data: if more_line or not data:
break break
......
...@@ -93,6 +93,32 @@ class IOTest(unittest.TestCase): ...@@ -93,6 +93,32 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.truncate(12), 12) self.assertEqual(f.truncate(12), 12)
self.assertEqual(f.tell(), 12) self.assertEqual(f.tell(), 12)
def read_ops(self, f, buffered=False):
    """Run the shared read/readinto/seek/tell checks against stream f.

    The assertions expect f to yield b"hello world\n" from its start.
    With buffered true, argument-less read() calls are checked as well.
    """
    eq = self.assertEqual
    # Sequential reads, reusing the first result as the readinto() target.
    data = f.read(5)
    eq(data, b"hello")
    eq(f.readinto(data), 5)
    eq(data, b" worl")
    eq(f.readinto(data), 2)
    eq(len(data), 5)
    eq(data[:2], b"d\n")
    # Absolute seek, then reading up to and at end of file.
    eq(f.seek(0), 0)
    eq(f.read(20), b"hello world\n")
    eq(f.read(1), b"")
    eq(f.readinto(b"x"), 0)
    # End-relative seek and zero-sized requests.
    eq(f.seek(-6, 2), 6)
    eq(f.read(5), b"world")
    eq(f.read(0), b"")
    eq(f.readinto(b""), 0)
    # Current-relative seek.
    eq(f.seek(-6, 1), 5)
    eq(f.read(5), b" worl")
    eq(f.tell(), 10)
    if not buffered:
        return
    f.seek(0)
    eq(f.read(), b"hello world\n")
    f.seek(6)
    eq(f.read(), b"world\n")
    eq(f.read(), b"")
LARGE = 2**31 LARGE = 2**31
def large_file_ops(self, f): def large_file_ops(self, f):
...@@ -112,24 +138,6 @@ class IOTest(unittest.TestCase): ...@@ -112,24 +138,6 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.seek(-1, 2), self.LARGE) self.assertEqual(f.seek(-1, 2), self.LARGE)
self.assertEqual(f.read(2), b"x") self.assertEqual(f.read(2), b"x")
def read_ops(self, f):
    """Run the basic read/readinto/seek/tell checks against stream f.

    The assertions expect f to yield b"hello world\n" from its start.
    """
    check = self.assertEqual
    # Sequential reads, reusing the first result as the readinto() target.
    data = f.read(5)
    check(data, b"hello")
    check(f.readinto(data), 5)
    check(data, b" worl")
    check(f.readinto(data), 2)
    check(len(data), 5)
    check(data[:2], b"d\n")
    # Absolute, end-relative and current-relative seeks; the values
    # returned by seek() are not inspected here.
    f.seek(0)
    check(f.read(20), b"hello world\n")
    f.seek(-6, 2)
    check(f.read(5), b"world")
    f.seek(-6, 1)
    check(f.read(5), b" worl")
    check(f.tell(), 10)
def test_raw_file_io(self): def test_raw_file_io(self):
f = io.open(test_support.TESTFN, "wb", buffering=0) f = io.open(test_support.TESTFN, "wb", buffering=0)
self.assertEqual(f.readable(), False) self.assertEqual(f.readable(), False)
...@@ -155,7 +163,7 @@ class IOTest(unittest.TestCase): ...@@ -155,7 +163,7 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.readable(), True) self.assertEqual(f.readable(), True)
self.assertEqual(f.writable(), False) self.assertEqual(f.writable(), False)
self.assertEqual(f.seekable(), True) self.assertEqual(f.seekable(), True)
self.read_ops(f) self.read_ops(f, True)
f.close() f.close()
def test_raw_bytes_io(self): def test_raw_bytes_io(self):
...@@ -164,7 +172,7 @@ class IOTest(unittest.TestCase): ...@@ -164,7 +172,7 @@ class IOTest(unittest.TestCase):
data = f.getvalue() data = f.getvalue()
self.assertEqual(data, b"hello world\n") self.assertEqual(data, b"hello world\n")
f = io.BytesIO(data) f = io.BytesIO(data)
self.read_ops(f) self.read_ops(f, True)
def test_large_file_ops(self): def test_large_file_ops(self):
# On Windows and Mac OSX this test consumes large resources; It takes # On Windows and Mac OSX this test consumes large resources; It takes
...@@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase): ...@@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase):
class TextIOWrapperTest(unittest.TestCase): class TextIOWrapperTest(unittest.TestCase):
## def tearDown(self):
## test_support.unlink(test_support.TESTFN)
def testNewlines(self): def testNewlines(self):
input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ] input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
...@@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase): ...@@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEquals(got_line, exp_line) self.assertEquals(got_line, exp_line)
self.assertEquals(len(got_lines), len(exp_lines)) self.assertEquals(len(got_lines), len(exp_lines))
# Systematic tests of the text I/O API
def testBasicIO(self):
    """Check write/read/seek/tell round trips on a text file.

    Every combination of chunk size and encoding below is run against
    test_support.TESTFN.  For the "utf..." encodings the multi-line test
    is run on the same open file as well.
    """
    eq = self.assertEquals
    chunk_sizes = (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65)
    encodings = ("ascii", "latin1", "utf8")  # , "utf-16-be", "utf-16-le"
    for chunksize in chunk_sizes:
        for enc in encodings:
            # Create the file with three characters in it.
            f = io.open(test_support.TESTFN, "w+", encoding=enc)
            f._CHUNK_SIZE = chunksize
            eq(f.write("abc"), 3)
            f.close()
            # Reopen for update and remember the end-of-data cookie.
            f = io.open(test_support.TESTFN, "r+", encoding=enc)
            f._CHUNK_SIZE = chunksize
            eq(f.tell(), 0)
            eq(f.read(), "abc")
            cookie = f.tell()
            # Piecewise reads up to and past the end.
            eq(f.seek(0), 0)
            eq(f.read(2), "ab")
            eq(f.read(1), "c")
            eq(f.read(1), "")
            eq(f.read(), "")
            eq(f.tell(), cookie)
            # Append at the end, then read the appended text back.
            eq(f.seek(0), 0)
            eq(f.seek(0, 2), cookie)
            eq(f.write("def"), 3)
            eq(f.seek(cookie), cookie)
            eq(f.read(), "def")
            if enc.startswith("utf"):
                self.multi_line_test(f, enc)
            f.close()
def multi_line_test(self, f, enc):
    """Write lines of many lengths to f and read them back with readline().

    The tell() value taken before each line is written must equal the
    tell() value taken before the same line is read, and the end
    positions after writing and after reading must agree.  The enc
    argument is accepted but not used.
    """
    f.seek(0)
    f.truncate()
    sample = u"s\xff\u0fff\uffff"
    sizes = (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65,
             100, 200, 300, 400, 500, 1000)
    wlines = []
    for size in sizes:
        # Cycle through the sample characters to build a line of `size`
        # characters plus the newline.
        body = [sample[i % len(sample)] for i in xrange(size)]
        line = u"".join(body) + "\n"
        wlines.append((f.tell(), line))
        f.write(line)
    wendpos = f.tell()
    f.seek(0)
    rlines = []
    pos = f.tell()
    line = f.readline()
    while line:
        rlines.append((pos, line))
        pos = f.tell()
        line = f.readline()
    # An empty readline() means EOF; pos was taken just before it.
    rendpos = pos
    self.assertEquals(rendpos, wendpos)
    self.assertEquals(rlines, wlines)
# XXX Tests for open() # XXX Tests for open()
def test_main(): def test_main():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment