Commit 04140388 authored by Guido van Rossum

Instead of pickling the whole decoder, use the new getstate/setstate API.

parent b9410c8a
...@@ -18,7 +18,7 @@ XXX don't use assert to validate input requirements ...@@ -18,7 +18,7 @@ XXX don't use assert to validate input requirements
XXX whenever an argument is None, use the default value XXX whenever an argument is None, use the default value
XXX read/write ops should check readable/writable XXX read/write ops should check readable/writable
XXX buffered readinto should work with arbitrary buffer objects XXX buffered readinto should work with arbitrary buffer objects
XXX use incremental encoder for text output, at least for UTF-16 XXX use incremental encoder for text output, at least for UTF-16 and UTF-8-SIG
""" """
__author__ = ("Guido van Rossum <guido@python.org>, " __author__ = ("Guido van Rossum <guido@python.org>, "
...@@ -36,11 +36,6 @@ import codecs ...@@ -36,11 +36,6 @@ import codecs
import _fileio import _fileio
import warnings import warnings
try:
import cPickle as pickle
except ImportError:
import pickle
# XXX Shouldn't we use st_blksize whenever we can? # XXX Shouldn't we use st_blksize whenever we can?
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
...@@ -957,17 +952,16 @@ class TextIOWrapper(TextIOBase): ...@@ -957,17 +952,16 @@ class TextIOWrapper(TextIOBase):
self._newline = newline or os.linesep self._newline = newline or os.linesep
self._fix_newlines = newline is None self._fix_newlines = newline is None
self._decoder = None self._decoder = None
self._decoder_in_rest_pickle = None
self._pending = "" self._pending = ""
self._snapshot = None self._snapshot = None
self._seekable = self._telling = self.buffer.seekable() self._seekable = self._telling = self.buffer.seekable()
# A word about _snapshot. This attribute is either None, or a # A word about _snapshot. This attribute is either None, or a
# tuple (decoder_pickle, readahead, pending) where decoder_pickle # tuple (decoder_state, readahead, pending) where decoder_state is
# is a pickled decoder state, readahead is the chunk of bytes that # the second (integer) item of the decoder state, readahead is the
# was read, and pending is the characters that were rendered by # chunk of bytes that was read, and pending is the characters that
# the decoder after feeding it those bytes. We use this to # were rendered by the decoder after feeding it those bytes. We
# reconstruct intermediate decoder states in tell(). # use this to reconstruct intermediate decoder states in tell().
def _seekable(self): def _seekable(self):
return self._seekable return self._seekable
...@@ -1005,10 +999,6 @@ class TextIOWrapper(TextIOBase): ...@@ -1005,10 +999,6 @@ class TextIOWrapper(TextIOBase):
raise IOError("Can't find an incremental decoder for encoding %s" % raise IOError("Can't find an incremental decoder for encoding %s" %
self._encoding) self._encoding)
decoder = self._decoder = make_decoder() # XXX: errors decoder = self._decoder = make_decoder() # XXX: errors
if isinstance(decoder, codecs.BufferedIncrementalDecoder):
# XXX Hack: make the codec use bytes instead of strings
decoder.buffer = b""
self._decoder_in_rest_pickle = pickle.dumps(decoder, 2) # For tell()
return decoder return decoder
def _read_chunk(self): def _read_chunk(self):
...@@ -1017,15 +1007,13 @@ class TextIOWrapper(TextIOBase): ...@@ -1017,15 +1007,13 @@ class TextIOWrapper(TextIOBase):
readahead = self.buffer.read1(self._CHUNK_SIZE) readahead = self.buffer.read1(self._CHUNK_SIZE)
pending = self._decoder.decode(readahead, not readahead) pending = self._decoder.decode(readahead, not readahead)
return readahead, pending return readahead, pending
decoder_state = pickle.dumps(self._decoder, 2) decoder_buffer, decoder_state = self._decoder.getstate()
readahead = self.buffer.read1(self._CHUNK_SIZE) readahead = self.buffer.read1(self._CHUNK_SIZE)
pending = self._decoder.decode(readahead, not readahead) pending = self._decoder.decode(readahead, not readahead)
self._snapshot = (decoder_state, readahead, pending) self._snapshot = (decoder_state, decoder_buffer + readahead, pending)
return readahead, pending return readahead, pending
def _encode_decoder_state(self, ds, pos): def _encode_decoder_state(self, ds, pos):
if ds == self._decoder_in_rest_pickle:
return pos
x = 0 x = 0
for i in bytes(ds): for i in bytes(ds):
x = x<<8 | i x = x<<8 | i
...@@ -1048,7 +1036,8 @@ class TextIOWrapper(TextIOBase): ...@@ -1048,7 +1036,8 @@ class TextIOWrapper(TextIOBase):
raise IOError("Telling position disabled by next() call") raise IOError("Telling position disabled by next() call")
self.flush() self.flush()
position = self.buffer.tell() position = self.buffer.tell()
if self._decoder is None or self._snapshot is None: decoder = self._decoder
if decoder is None or self._snapshot is None:
assert self._pending == "" assert self._pending == ""
return position return position
decoder_state, readahead, pending = self._snapshot decoder_state, readahead, pending = self._snapshot
...@@ -1056,15 +1045,21 @@ class TextIOWrapper(TextIOBase): ...@@ -1056,15 +1045,21 @@ class TextIOWrapper(TextIOBase):
needed = len(pending) - len(self._pending) needed = len(pending) - len(self._pending)
if not needed: if not needed:
return self._encode_decoder_state(decoder_state, position) return self._encode_decoder_state(decoder_state, position)
decoder = pickle.loads(decoder_state) saved_state = decoder.getstate()
n = 0 try:
bb = bytes(1) decoder.setstate(("", decoder_state))
for i, bb[0] in enumerate(readahead): n = 0
n += len(decoder.decode(bb)) bb = bytes(1)
if n >= needed: for i, bb[0] in enumerate(readahead):
decoder_state = pickle.dumps(decoder, 2) n += len(decoder.decode(bb))
return self._encode_decoder_state(decoder_state, position+i+1) if n >= needed:
raise IOError("Can't reconstruct logical file position") decoder_buffer, decoder_state = decoder.getstate()
return self._encode_decoder_state(
decoder_state,
position + (i+1) - len(decoder_buffer))
raise IOError("Can't reconstruct logical file position")
finally:
decoder.setstate(saved_state)
def seek(self, pos, whence=0): def seek(self, pos, whence=0):
if not self._seekable: if not self._seekable:
...@@ -1097,12 +1092,11 @@ class TextIOWrapper(TextIOBase): ...@@ -1097,12 +1092,11 @@ class TextIOWrapper(TextIOBase):
self._pending = "" self._pending = ""
self._decoder = None self._decoder = None
return pos return pos
decoder = pickle.loads(ds) decoder = self._decoder or self._get_decoder()
decoder.set_state(("", ds))
self.buffer.seek(pos) self.buffer.seek(pos)
self._snapshot = (ds, b"", "") self._snapshot = (ds, b"", "")
self._pending = "" self._pending = ""
if not self._decoder_in_rest_pickle:
self._get_decoder() # For its side effect
self._decoder = decoder self._decoder = decoder
return orig_pos return orig_pos
......
...@@ -581,6 +581,36 @@ class TextIOWrapperTest(unittest.TestCase): ...@@ -581,6 +581,36 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEquals(f.tell(), p2) self.assertEquals(f.tell(), p2)
f.close() f.close()
def testSeeking(self):
    """Check read(), tell() and readline() around a multi-byte character.

    Each line written is ``_CHUNK_SIZE - 2`` ASCII characters followed by
    U+8888 (three bytes in UTF-8) and a newline, so the multi-byte
    character begins two bytes before offset ``_CHUNK_SIZE``.
    NOTE(review): presumably the wrapper reads ``_CHUNK_SIZE`` bytes at a
    time, so the character straddles a read boundary -- confirm.
    """
    chunk_size = io.TextIOWrapper._CHUNK_SIZE
    prefix_size = chunk_size - 2
    u_prefix = u"a" * prefix_size
    prefix = bytes(u_prefix.encode("utf-8"))
    # Pure ASCII: one byte per character, so byte and character
    # offsets coincide for the prefix.
    self.assertEquals(len(u_prefix), len(prefix))
    u_suffix = u"\u8888\n"
    suffix = bytes(u_suffix.encode("utf-8"))
    line = prefix + suffix
    f = io.open(test_support.TESTFN, "wb")
    try:
        f.write(line*2)
    finally:
        f.close()
    f = io.open(test_support.TESTFN, "r", encoding="utf-8")
    try:
        s = f.read(prefix_size)
        # Compare the decoded text with the text prefix, not with its
        # encoded bytes.
        self.assertEquals(s, u_prefix)
        self.assertEquals(f.tell(), prefix_size)
        self.assertEquals(f.readline(), u_suffix)
    finally:
        # Always release the handle so the test file can be removed.
        f.close()
def testSeekingToo(self):
    """Regression test: tell() after readline() with a tiny chunk size.

    The file holds one three-byte UTF-8 sequence followed by a newline,
    and the chunk size is set to 2. The test only checks that
    readline() followed by tell() completes without raising.
    """
    # Regression test for a specific bug
    data = b'\xe0\xbf\xbf\n'
    f = io.open(test_support.TESTFN, "wb")
    try:
        f.write(data)
    finally:
        f.close()
    f = io.open(test_support.TESTFN, "r", encoding="utf-8")
    try:
        f._CHUNK_SIZE  # Just test that it exists
        f._CHUNK_SIZE = 2
        f.readline()
        f.tell()
    finally:
        # Always release the handle so the test file can be removed.
        f.close()
def timingTest(self): def timingTest(self):
timer = time.time timer = time.time
enc = "utf8" enc = "utf8"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment