Commit 9b76da6a authored by Guido van Rossum

Checkpoint so I can continue to work on this at a different box.

There is somewhat working (but slow) code supporting seek/tell for text files,
but extensive testing exposes a bug I can't nail down.
parent 8742977b
@@ -13,8 +13,9 @@ variable are part of the specification.
XXX need to default buffer size to 1 if isatty()
XXX need to support 1 meaning line-buffered
XXX change behavior of blocking I/O
XXX don't use assert to validate input requirements
XXX whenever an argument is None, use the default value
XXX read/write ops should check readable/writable
"""
__author__ = ("Guido van Rossum <guido@python.org>, "
@@ -29,9 +30,11 @@ __all__ = ["BlockingIOError", "open", "IOBase", "RawIOBase", "FileIO",
import os
import sys
import codecs
import pickle
import _fileio
import warnings
# XXX Shouldn't we use st_blksize whenever we can?
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
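The XXX above asks whether st_blksize should be used; a minimal sketch of that idea (not part of the patch; the helper name is made up) might look like this:

import os

def preferred_buffer_size(fd, default=8 * 1024):
    """Return the OS-preferred block size for fd, or default if unknown."""
    try:
        return os.fstat(fd).st_blksize or default
    except (OSError, AttributeError):
        # st_blksize is not available on every platform
        return default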
@@ -44,18 +47,22 @@ class BlockingIOError(IOError):
self.characters_written = characters_written
def open(file, mode="r", buffering=None, *, encoding=None):
def open(file, mode="r", buffering=None, *, encoding=None, newline=None):
"""Replacement for the built-in open function.
Args:
file: string giving the name of the file to be opened;
or integer file descriptor of the file to be wrapped (*)
mode: optional mode string; see below
or integer file descriptor of the file to be wrapped (*).
mode: optional mode string; see below.
buffering: optional int >= 0 giving the buffer size; values
can be: 0 = unbuffered, 1 = line buffered,
larger = fully buffered
encoding: optional string giving the text encoding (*must* be given
as a keyword argument)
larger = fully buffered.
Keywords (for text modes only; *must* be given as keyword arguments):
encoding: optional string giving the text encoding.
newline: optional newlines specifier; must be None, '\n' or '\r\n';
specifies the line ending expected on input and written on
output. If None, use universal newlines on input and
use os.linesep on output.
(*) If a file descriptor is given, it is closed when the returned
I/O object is closed. If you don't want this to happen, use
@@ -79,6 +86,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
binary stream, a buffered binary stream, or a buffered text
stream, open for reading and/or writing.
"""
# XXX Don't use asserts for these checks; raise TypeError or ValueError
assert isinstance(file, (basestring, int)), repr(file)
assert isinstance(mode, basestring), repr(mode)
assert buffering is None or isinstance(buffering, int), repr(buffering)
@@ -101,7 +109,9 @@ def open(file, mode="r", buffering=None, *, encoding=None):
if not (reading or writing or appending):
raise ValueError("must have exactly one of read/write/append mode")
if binary and encoding is not None:
raise ValueError("binary mode doesn't take an encoding")
raise ValueError("binary mode doesn't take an encoding argument")
if binary and newline is not None:
raise ValueError("binary mode doesn't take a newline argument")
raw = FileIO(file,
(reading and "r" or "") +
(writing and "w" or "") +
@@ -132,9 +142,7 @@ def open(file, mode="r", buffering=None, *, encoding=None):
buffer = BufferedReader(raw, buffering)
if binary:
return buffer
# XXX What about newline conventions?
textio = TextIOWrapper(buffer, encoding)
return textio
return TextIOWrapper(buffer, encoding, newline)
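As a rough usage sketch of the signature change above (the file name is made up), encoding and newline are keyword-only and are rejected in binary mode, matching the checks in open():

import io

f = io.open("scratch.txt", "w", encoding="utf-8", newline="\r\n")
f.write("hello\n")
f.close()

try:
    io.open("scratch.txt", "rb", encoding="utf-8")
except ValueError as exc:
    print(exc)   # binary mode doesn't take an encoding argument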
class IOBase:
@@ -795,6 +803,8 @@ class TextIOBase(IOBase):
"""Base class for text I/O.
This class provides a character and line based interface to stream I/O.
There is no readinto() method, as character strings are immutable.
"""
def read(self, n: int = -1) -> str:
@@ -805,10 +815,18 @@
"""
self._unsupported("read")
def write(self, s: str):
"""write(s: str) -> None. Write string s to stream."""
def write(self, s: str) -> int:
"""write(s: str) -> int. Write string s to stream."""
self._unsupported("write")
def truncate(self, pos: int = None) -> int:
"""truncate(pos: int = None) -> int. Truncate size to pos."""
self.flush()
if pos is None:
pos = self.tell()
self.seek(pos)
return self.buffer.truncate()
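A hedged usage sketch of the truncate() fallback defined above: with pos left as None, it truncates at the current position (the file name is made up):

import io

f = io.open("notes.txt", "w+", encoding="ascii")
f.write("keep me\ndrop me\n")
f.seek(0)
f.readline()        # leaves the position just after "keep me\n"
f.truncate()        # pos is None, so truncate at the current position
f.seek(0)
print(f.read())     # keep me\n
f.close()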
def readline(self) -> str:
"""readline() -> str. Read until newline or EOF.
@@ -816,12 +834,12 @@
"""
self._unsupported("readline")
def __iter__(self):
def __iter__(self) -> "TextIOBase": # That's a forward reference
"""__iter__() -> Iterator. Return line iterator (actually just self).
"""
return self
def next(self):
def next(self) -> str:
"""Same as readline() except raises StopIteration on immediate EOF."""
line = self.readline()
if not line:
@@ -855,11 +873,11 @@ class TextIOWrapper(TextIOBase):
Character and line based layer over a BufferedIOBase object.
"""
# XXX tell(), seek()
_CHUNK_SIZE = 64
def __init__(self, buffer, encoding=None, newline=None):
if newline not in (None, '\n', '\r\n'):
raise IOError("illegal newline %s" % newline) # XXX: ValueError?
if newline not in (None, "\n", "\r\n"):
raise ValueError("illegal newline value: %r" % (newline,))
if encoding is None:
# XXX This is questionable
encoding = sys.getfilesystemencoding() or "latin-1"
@@ -869,7 +887,20 @@
self._newline = newline or os.linesep
self._fix_newlines = newline is None
self._decoder = None
self._pending = ''
self._decoder_in_rest_pickle = None
self._pending = ""
self._snapshot = None
self._seekable = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a
# tuple (position, decoder_pickle, readahead) where position is a
# position of the underlying buffer, decoder_pickle is a pickled
# decoder state, and readahead is the chunk of bytes that was read
# from that position. We use this to reconstruct intermediate
# decoder states in tell().
def _seekable(self):
return self._seekable
def flush(self):
self.buffer.flush()
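The snapshot scheme described in the comment above assumes an incremental decoder can be pickled mid-stream and later restored; a standalone sketch of that assumption (the codec choice is arbitrary):

import codecs
import pickle

decoder = codecs.getincrementaldecoder("utf-8")()
data = "café latte".encode("utf-8")      # the 'é' occupies two bytes

decoder.decode(data[:4])                 # ends mid-character; "caf" decoded so far
saved = pickle.dumps(decoder, 2)         # snapshot of the decoder state

restored = pickle.loads(saved)
print(restored.decode(data[4:], True))   # é latte  (picks up where we left off)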
@@ -886,35 +917,124 @@
return self.buffer.fileno()
def write(self, s: str):
# XXX What if we were just reading?
b = s.encode(self._encoding)
if isinstance(b, str):
b = bytes(b)
n = self.buffer.write(b)
if "\n" in s:
self.flush()
return n
self._snapshot = self._decoder = None
return len(s)
def _get_decoder(self):
make_decoder = codecs.getincrementaldecoder(self._encoding)
if make_decoder is None:
raise IOError(".readline() not supported for encoding %s" %
raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
# XXX Hack: make the codec use bytes instead of strings
decoder.buffer = b""
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
return decoder
def _read_chunk(self):
if not self._seekable:
return self.buffer.read(self._CHUNK_SIZE)
assert self._decoder is not None
position = self.buffer.tell()
decoder_state = pickle.dumps(self._decoder, 2)
readahead = self.buffer.read(self._CHUNK_SIZE)
self._snapshot = (position, decoder_state, readahead)
return readahead
def _encode_decoder_state(self, ds, pos):
if ds == self._decoder_in_rest_pickle:
return pos
x = 0
for i in bytes(ds):
x = x<<8 | i
return (x<<64) | pos
def _decode_decoder_state(self, pos):
x, pos = divmod(pos, 1<<64)
if not x:
return None, pos
b = b""
while x:
b.append(x&0xff)
x >>= 8
return str(b[::-1]), pos
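The pair of helpers above pack the pickled decoder state into the bits above bit 64 of the reported position, keeping the raw byte offset in the low 64 bits; a simplified standalone round trip of that packing scheme:

def encode_cookie(state: bytes, pos: int) -> int:
    x = 0
    for byte in state:
        x = (x << 8) | byte
    # like the code above, leading zero bytes in state would be lost
    return (x << 64) | pos

def decode_cookie(cookie: int):
    x, pos = divmod(cookie, 1 << 64)
    out = bytearray()
    while x:
        out.append(x & 0xFF)
        x >>= 8
    return bytes(out[::-1]), pos

state, pos = decode_cookie(encode_cookie(b"\x01\x02", 1234))
assert (state, pos) == (b"\x01\x02", 1234)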
def tell(self):
if not self._seekable:
raise IOError("Underlying stream is not seekable")
self.flush()
if self._decoder is None or self._snapshot is None:
assert self._pending == ""
return self.buffer.tell()
position, decoder_state, readahead = self._snapshot
decoder = pickle.loads(decoder_state)
characters = ""
sequence = []
for i, b in enumerate(readahead):
c = decoder.decode(bytes([b]))
if c:
characters += c
sequence.append((characters, i+1, pickle.dumps(decoder, 2)))
for ch, i, st in sequence:
if ch + self._pending == characters:
return self._encode_decoder_state(st, position + i)
raise IOError("Can't reconstruct logical file position")
def seek(self, pos, whence=0):
if not self._seekable:
raise IOError("Underlying stream is not seekable")
if whence == 1:
if pos != 0:
raise IOError("Can't do nonzero cur-relative seeks")
return self.tell()
if whence == 2:
if pos != 0:
raise IOError("Can't do nonzero end-relative seeks")
self.flush()
pos = self.buffer.seek(0, 2)
self._snapshot = None
self._pending = ""
self._decoder = None
return pos
if whence != 0:
raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
(whence,))
if pos < 0:
raise ValueError("Negative seek position %r" % (pos,))
orig_pos = pos
ds, pos = self._decode_decoder_state(pos)
if not ds:
self.buffer.seek(pos)
self._snapshot = None
self._pending = ""
self._decoder = None
return pos
decoder = pickle.loads(ds)
self.buffer.seek(pos)
self._snapshot = (pos, ds, "")
self._pending = ""
self._decoder = None
return orig_pos
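A usage sketch of the seek() contract implemented above: absolute seeks take the opaque cookie returned by tell(), while cur- and end-relative seeks only accept a zero offset (the file name is made up):

import io

f = io.open("sample.txt", "w+", encoding="utf-8")
f.write("abc\ndef\n")
f.seek(0)
f.readline()               # consume the first line
cookie = f.tell()          # opaque cookie, not necessarily a byte count
f.seek(0, 2)               # end-relative seek with offset 0 is allowed
f.seek(cookie)             # jump back to the remembered position
print(f.readline())        # def
f.close()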
def read(self, n: int = -1):
decoder = self._decoder or self._get_decoder()
res = self._pending
if n < 0:
res += decoder.decode(self.buffer.read(), True)
self._pending = ""
self._snapshot = None
return res
else:
while len(res) < n:
data = self.buffer.read(64)
data = self._read_chunk()
res += decoder.decode(data, not data)
if not data:
break
@@ -923,7 +1043,7 @@ class TextIOWrapper(TextIOBase):
def readline(self, limit=None):
if limit is not None:
# XXX Hack to support limit arg
# XXX Hack to support limit argument, for backwards compatibility
line = self.readline()
if len(line) <= limit:
return line
@@ -951,7 +1071,7 @@ class TextIOWrapper(TextIOBase):
# We've seen \r - is it standalone, \r\n or \r at end of line?
if endpos + 1 < len(line):
if line[endpos+1] == '\n':
if line[endpos+1] == "\n":
ending = "\r\n"
else:
ending = "\r"
@@ -963,7 +1083,7 @@ class TextIOWrapper(TextIOBase):
# No line ending seen yet - get more data
while True:
data = self.buffer.read(64)
data = self._read_chunk()
more_line = decoder.decode(data, not data)
if more_line or not data:
break
@@ -93,6 +93,32 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.truncate(12), 12)
self.assertEqual(f.tell(), 12)
def read_ops(self, f, buffered=False):
data = f.read(5)
self.assertEqual(data, b"hello")
self.assertEqual(f.readinto(data), 5)
self.assertEqual(data, b" worl")
self.assertEqual(f.readinto(data), 2)
self.assertEqual(len(data), 5)
self.assertEqual(data[:2], b"d\n")
self.assertEqual(f.seek(0), 0)
self.assertEqual(f.read(20), b"hello world\n")
self.assertEqual(f.read(1), b"")
self.assertEqual(f.readinto(b"x"), 0)
self.assertEqual(f.seek(-6, 2), 6)
self.assertEqual(f.read(5), b"world")
self.assertEqual(f.read(0), b"")
self.assertEqual(f.readinto(b""), 0)
self.assertEqual(f.seek(-6, 1), 5)
self.assertEqual(f.read(5), b" worl")
self.assertEqual(f.tell(), 10)
if buffered:
f.seek(0)
self.assertEqual(f.read(), b"hello world\n")
f.seek(6)
self.assertEqual(f.read(), b"world\n")
self.assertEqual(f.read(), b"")
LARGE = 2**31
def large_file_ops(self, f):
@@ -112,24 +138,6 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.seek(-1, 2), self.LARGE)
self.assertEqual(f.read(2), b"x")
def read_ops(self, f):
data = f.read(5)
self.assertEqual(data, b"hello")
n = f.readinto(data)
self.assertEqual(n, 5)
self.assertEqual(data, b" worl")
n = f.readinto(data)
self.assertEqual(n, 2)
self.assertEqual(len(data), 5)
self.assertEqual(data[:2], b"d\n")
f.seek(0)
self.assertEqual(f.read(20), b"hello world\n")
f.seek(-6, 2)
self.assertEqual(f.read(5), b"world")
f.seek(-6, 1)
self.assertEqual(f.read(5), b" worl")
self.assertEqual(f.tell(), 10)
def test_raw_file_io(self):
f = io.open(test_support.TESTFN, "wb", buffering=0)
self.assertEqual(f.readable(), False)
@@ -155,7 +163,7 @@ class IOTest(unittest.TestCase):
self.assertEqual(f.readable(), True)
self.assertEqual(f.writable(), False)
self.assertEqual(f.seekable(), True)
self.read_ops(f)
self.read_ops(f, True)
f.close()
def test_raw_bytes_io(self):
@@ -164,7 +172,7 @@ class IOTest(unittest.TestCase):
data = f.getvalue()
self.assertEqual(data, b"hello world\n")
f = io.BytesIO(data)
self.read_ops(f)
self.read_ops(f, True)
def test_large_file_ops(self):
# On Windows and Mac OS X this test consumes large resources; it takes
@@ -445,6 +453,10 @@ class BufferedRandomTest(unittest.TestCase):
class TextIOWrapperTest(unittest.TestCase):
## def tearDown(self):
## test_support.unlink(test_support.TESTFN)
def testNewlines(self):
input_lines = [ "unix\n", "windows\r\n", "os9\r", "last\n", "nonl" ]
@@ -486,6 +498,62 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEquals(got_line, exp_line)
self.assertEquals(len(got_lines), len(exp_lines))
# Systematic tests of the text I/O API
def testBasicIO(self):
for chunksize in (1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65):
for enc in "ascii", "latin1", "utf8":  # , "utf-16-be", "utf-16-le"
f = io.open(test_support.TESTFN, "w+", encoding=enc)
f._CHUNK_SIZE = chunksize
self.assertEquals(f.write("abc"), 3)
f.close()
f = io.open(test_support.TESTFN, "r+", encoding=enc)
f._CHUNK_SIZE = chunksize
self.assertEquals(f.tell(), 0)
self.assertEquals(f.read(), "abc")
cookie = f.tell()
self.assertEquals(f.seek(0), 0)
self.assertEquals(f.read(2), "ab")
self.assertEquals(f.read(1), "c")
self.assertEquals(f.read(1), "")
self.assertEquals(f.read(), "")
self.assertEquals(f.tell(), cookie)
self.assertEquals(f.seek(0), 0)
self.assertEquals(f.seek(0, 2), cookie)
self.assertEquals(f.write("def"), 3)
self.assertEquals(f.seek(cookie), cookie)
self.assertEquals(f.read(), "def")
if enc.startswith("utf"):
self.multi_line_test(f, enc)
f.close()
def multi_line_test(self, f, enc):
f.seek(0)
f.truncate()
sample = u"s\xff\u0fff\uffff"
wlines = []
for size in (0, 1, 2, 3, 4, 5, 15, 16, 17, 31, 32, 33, 63, 64, 65,
100, 200, 300, 400, 500, 1000):
chars = []
for i in xrange(size):
chars.append(sample[i % len(sample)])
line = u"".join(chars) + "\n"
wlines.append((f.tell(), line))
f.write(line)
wendpos = f.tell()
f.seek(0)
rlines = []
while True:
pos = f.tell()
line = f.readline()
if not line:
rendpos = pos
break
rlines.append((pos, line))
self.assertEquals(rendpos, wendpos)
self.assertEquals(rlines, wlines)
# XXX Tests for open()
def test_main():