Commit b9410c8a authored by Walter Dörwald

Apply SF patch #1698994: Add getstate() and setstate()

methods to incremental codecs.

Also forward port r54786 (fix the incremental
utf_8_sig decoder).
parent bc6d5f8b
......@@ -405,6 +405,21 @@ define in order to be compatible with the Python codec registry.
Reset the encoder to the initial state.
Return the current state of the encoder which must be an integer.
The implementation should make sure that \code{0} is the most common state.
(States that are more complicated than integers can be converted into an
integer by marshaling/pickling the state and encoding the bytes of the
resulting string into an integer).
Set the state of the encoder to \var{state}. \var{state} must be an
encoder state returned by \method{getstate}.
\subsubsection{IncrementalDecoder Objects \label{incremental-decoder-objects}}
......@@ -453,6 +468,27 @@ define in order to be compatible with the Python codec registry.
Reset the decoder to the initial state.
Return the current state of the decoder. This must be a tuple with two
items, the first must be the buffer containing the still undecoded input.
The second must be an integer and can be additional state info.
(The implementation should make sure that \code{0} is the most common
additional state info.) If this additional state info is \code{0} it must
be possible to set the decoder to the state which has no input buffered
and \code{0} as the additional state info, so that feeding the previously
buffered input to the decoder returns it to the previous state without
producing any output. (Additional state info that is more complicated
than integers can be converted into an integer by marshaling/pickling
the info and encoding the bytes of the resulting string into an integer.)
Set the state of the decoder to \var{state}. \var{state} must be a
decoder state returned by \method{getstate}.
The \class{StreamWriter} and \class{StreamReader} classes provide
generic working interfaces which can be used to implement new
......@@ -87,7 +87,9 @@ class CodecInfo(tuple):
return self
def __repr__(self):
return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__,, id(self))
return "<%s.%s object for encoding %s at 0x%x>" % \
(self.__class__.__module__, self.__class__.__name__,, id(self))
class Codec:
......@@ -155,9 +157,9 @@ class Codec:
class IncrementalEncoder(object):
An IncrementalEncoder encodes an input in multiple steps. The input can be
passed piece by piece to the encode() method. The IncrementalEncoder remembers
the state of the Encoding process between calls to encode().
An IncrementalEncoder encodes an input in multiple steps. The input can
be passed piece by piece to the encode() method. The IncrementalEncoder
remembers the state of the encoding process between calls to encode().
def __init__(self, errors='strict'):
......@@ -181,6 +183,18 @@ class IncrementalEncoder(object):
Resets the encoder to the initial state.
def getstate(self):
    """
    Return the current state of the encoder.

    This base implementation always reports 0.
    """
    return 0
def setstate(self, state):
    """
    Set the current state of the encoder. state must have been
    returned by getstate().

    This base implementation does nothing with state.
    """
class BufferedIncrementalEncoder(IncrementalEncoder):
This subclass of IncrementalEncoder can be used as the baseclass for an
......@@ -189,7 +203,8 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
def __init__(self, errors='strict'):
IncrementalEncoder.__init__(self, errors)
self.buffer = "" # unencoded input that is kept between calls to encode()
# unencoded input that is kept between calls to encode()
self.buffer = ""
def _buffer_encode(self, input, errors, final):
# Override this method in subclasses: It must encode input
......@@ -208,10 +223,16 @@ class BufferedIncrementalEncoder(IncrementalEncoder):
self.buffer = ""
def getstate(self):
    """
    Return the unencoded input that is still buffered, or 0 when the
    buffer is empty.
    """
    pending = self.buffer
    if pending:
        return pending
    return 0
def setstate(self, state):
    """
    Restore the buffer from a value returned by getstate(); any false
    state (such as 0) leaves the buffer empty.
    """
    self.buffer = state if state else ""
class IncrementalDecoder(object):
An IncrementalDecoder decodes an input in multiple steps. The input can be
passed piece by piece to the decode() method. The IncrementalDecoder
An IncrementalDecoder decodes an input in multiple steps. The input can
be passed piece by piece to the decode() method. The IncrementalDecoder
remembers the state of the decoding process between calls to decode().
def __init__(self, errors='strict'):
......@@ -235,15 +256,29 @@ class IncrementalDecoder(object):
Resets the decoder to the initial state.
def getstate(self):
    """
    Return the current state of the decoder. This must be a
    (buffered_input, additional_state_info) tuple.

    This base implementation always reports an empty buffer and 0 as
    the additional state info.
    """
    return ("", 0)
def setstate(self, state):
    """
    Set the current state of the decoder. state must have been
    returned by getstate().

    This base implementation does nothing with state.
    """
class BufferedIncrementalDecoder(IncrementalDecoder):
This subclass of IncrementalDecoder can be used as the baseclass for an
incremental decoder if the decoder must be able to handle incomplete byte
incremental decoder if the decoder must be able to handle incomplete
byte sequences.
def __init__(self, errors='strict'):
IncrementalDecoder.__init__(self, errors)
self.buffer = "" # undecoded input that is kept between calls to decode()
# undecoded input that is kept between calls to decode()
self.buffer = ""
def _buffer_decode(self, input, errors, final):
# Override this method in subclasses: It must decode input
......@@ -262,6 +297,14 @@ class BufferedIncrementalDecoder(IncrementalDecoder):
self.buffer = ""
def getstate(self):
    """
    Return (buffered_input, 0): the still undecoded input together
    with an additional state info that is always 0.
    """
    pending = self.buffer
    return (pending, 0)
def setstate(self, state):
    """
    Restore the buffered input from the first item of state; the
    additional state info in state[1] is not looked at.
    """
    buffered = state[0]
    self.buffer = buffered
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
......@@ -424,7 +467,8 @@ class StreamReader(Codec):
newchars, decodedbytes = self.decode(data, self.errors)
except UnicodeDecodeError as exc:
if firstline:
newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
newchars, decodedbytes = \
self.decode(data[:exc.start], self.errors)
lines = newchars.splitlines(True)
if len(lines)<=1:
......@@ -34,6 +34,22 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
self.encoder = None
def getstate(self):
    # Value handed back to the caller:
    # 0: an encoder function has already been chosen
    #    (stream is in natural order for this platform)
    # 2: no encoder chosen yet, endianness hasn't been determined
    # (we're never writing in unnatural order)
    if self.encoder is None:
        return 2
    return 0
def setstate(self, state):
    """
    Restore the encoder from a value returned by getstate().

    A true state (2) means the endianness has not been determined yet,
    so no encoder function is selected. A false state (0) selects the
    encoder for the natural byte order of this platform.
    """
    if state:
        self.encoder = None
    else:
        if sys.byteorder == 'little':
            self.encoder = codecs.utf_16_le_encode
        else:
            self.encoder = codecs.utf_16_be_encode
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
......@@ -56,6 +72,35 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
self.decoder = None
def getstate(self):
    # Only the buffered input is taken from the base class; its
    # additional state info (item 1 of the base class result) isn't
    # passed along to the caller.
    buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
    # additional state info we pass to the caller:
    # 0: stream is in natural order for this platform
    # 1: stream is in unnatural order
    # 2: endianness hasn't been determined yet
    if self.decoder is None:
        return (buffered, 2)
    platform_is_big = sys.byteorder == "big"
    stream_is_big = self.decoder is codecs.utf_16_be_decode
    return (buffered, int(platform_is_big != stream_is_big))
def setstate(self, state):
    """
    Restore the decoder from a (buffered_input, additional_state_info)
    tuple returned by getstate().

    Additional state info 0 selects the decoder for the natural byte
    order of this platform, 1 the decoder for the opposite order. Any
    other value leaves the decoder undetermined.
    """
    # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
    codecs.BufferedIncrementalDecoder.setstate(self, state)
    state = state[1]
    if state == 0:
        self.decoder = (codecs.utf_16_be_decode
                        if sys.byteorder == "big"
                        else codecs.utf_16_le_decode)
    elif state == 1:
        self.decoder = (codecs.utf_16_le_decode
                        if sys.byteorder == "big"
                        else codecs.utf_16_be_decode)
    else:
        self.decoder = None
class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
self.bom_written = False
......@@ -12,7 +12,8 @@ import codecs
### Codec APIs
def encode(input, errors='strict'):
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
def decode(input, errors='strict'):
prefix = 0
......@@ -25,38 +26,61 @@ def decode(input, errors='strict'):
class IncrementalEncoder(codecs.IncrementalEncoder):
def __init__(self, errors='strict'):
codecs.IncrementalEncoder.__init__(self, errors)
self.first = True
self.first = 1
def encode(self, input, final=False):
if self.first:
self.first = False
return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
self.first = 0
return codecs.BOM_UTF8 + \
codecs.utf_8_encode(input, self.errors)[0]
return codecs.utf_8_encode(input, self.errors)[0]
def reset(self):
self.first = True
self.first = 1
def getstate(self):
    """
    Return the flag that tells whether the BOM still has to be
    written by the next encode() call.
    """
    bom_pending = self.first
    return bom_pending
def setstate(self, state):
    """
    Restore the flag that tells whether the BOM still has to be
    written; state must have been returned by getstate().
    """
    bom_pending = state
    self.first = bom_pending
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def __init__(self, errors='strict'):
codecs.BufferedIncrementalDecoder.__init__(self, errors)
self.first = True
self.first = 1
def _buffer_decode(self, input, errors, final):
if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
if self.first:
if len(input) < 3:
# not enough data to decide if this really is a BOM
# => try again on the next call
return (u"", 0)
(output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
self.first = False
return (output, consumed+3)
if codecs.BOM_UTF8.startswith(input):
# not enough data to decide if this really is a BOM
# => try again on the next call
return (u"", 0)
self.first = 0
self.first = 0
if input[:3] == codecs.BOM_UTF8:
(output, consumed) = \
codecs.utf_8_decode(input[3:], errors, final)
return (output, consumed+3)
return codecs.utf_8_decode(input, errors, final)
def reset(self):
self.first = True
self.first = 1
def getstate(self):
    """
    Return (buffered_input, first), where first tells whether the
    start of the input still has to be checked for a BOM.
    """
    # Item 1 of the base class state must be 0 here, as it isn't
    # passed along to the caller
    buffered = codecs.BufferedIncrementalDecoder.getstate(self)[0]
    return (buffered, self.first)
def setstate(self, state):
    """
    Restore the buffered input and the BOM flag from a
    (buffered_input, first) tuple returned by getstate().
    """
    # The base class only restores the buffer from state[0] and
    # ignores state[1]
    codecs.BufferedIncrementalDecoder.setstate(self, state)
    bom_flag = state[1]
    self.first = bom_flag
class StreamWriter(codecs.StreamWriter):
def reset(self):
......@@ -23,7 +23,40 @@ class Queue(object):
self._buffer = self._buffer[size:]
return s
class ReadTest(unittest.TestCase):
class MixInCheckStateHandling:
    """
    Mix-in with helpers that check the getstate()/setstate() handling
    of incremental codecs.

    The class it is mixed into must provide assert_() and assertEqual().
    """

    def check_state_handling_decode(self, encoding, u, s):
        """
        Split the encoded input s at every position, move the decoder
        state from the first decoder to a fresh one and check that both
        parts together decode to u.
        """
        # range() instead of xrange(): the inputs are short, and range()
        # is available under the same name in every Python version
        for i in range(len(s)+1):
            d = codecs.getincrementaldecoder(encoding)()
            part1 = d.decode(s[:i])
            state = d.getstate()
            self.assert_(isinstance(state[1], int))
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                d.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assert_(not d.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, d.getstate())
            # Create a new decoder and set it to the state
            # we extracted from the old one
            d = codecs.getincrementaldecoder(encoding)()
            d.setstate(state)
            part2 = d.decode(s[i:], True)
            self.assertEqual(u, part1+part2)

    def check_state_handling_encode(self, encoding, u, s):
        """
        Split the input u at every position, move the encoder state
        from the first encoder to a fresh one and check that both parts
        together encode to s.
        """
        for i in range(len(u)+1):
            d = codecs.getincrementalencoder(encoding)()
            part1 = d.encode(u[:i])
            state = d.getstate()
            # Continue with a new encoder that starts from the state
            # extracted from the old one
            d = codecs.getincrementalencoder(encoding)()
            d.setstate(state)
            part2 = d.encode(u[i:], True)
            self.assertEqual(s, part1+part2)
class ReadTest(unittest.TestCase, MixInCheckStateHandling):
def check_partial(self, input, partialresults):
# get a StreamReader for the encoding and feed the bytestring version
# of input to the reader byte by byte. Read everything available from
......@@ -292,7 +325,14 @@ class UTF16Test(ReadTest):
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
"\xff", "strict", True)
def test_decoder_state(self):
    # Check that the decoder state survives being moved to a fresh
    # decoder, for both self.spamle and self.spambe as encoded input
    self.check_state_handling_decode(self.encoding,
                                     u"spamspam", self.spamle)
    self.check_state_handling_decode(self.encoding,
                                     u"spamspam", self.spambe)
class UTF16LETest(ReadTest):
encoding = "utf-16-le"
......@@ -313,7 +353,8 @@ class UTF16LETest(ReadTest):
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
"\xff", "strict", True)
class UTF16BETest(ReadTest):
encoding = "utf-16-be"
......@@ -334,7 +375,8 @@ class UTF16BETest(ReadTest):
def test_errors(self):
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
"\xff", "strict", True)
class UTF8Test(ReadTest):
encoding = "utf-8"
......@@ -357,6 +399,11 @@ class UTF8Test(ReadTest):
def test_decoder_state(self):
    # Check that the decoder state survives being moved to a fresh
    # decoder at every split position of the encoded sample text
    u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
    self.check_state_handling_decode(self.encoding,
                                     u, u.encode(self.encoding))
class UTF7Test(ReadTest):
encoding = "utf-7"
......@@ -429,6 +476,16 @@ class UTF8SigTest(ReadTest):
# SF bug #1601501: check that the codec works with a buffer
unicode("\xef\xbb\xbf", "utf-8-sig")
def test_bom(self):
    # Decoding text that was encoded with utf-8-sig in one call must
    # give back the text itself
    expected = u"spam"
    encoded = expected.encode("utf-8-sig")
    decoder = codecs.getincrementaldecoder("utf-8-sig")()
    self.assertEqual(decoder.decode(encoded), expected)
def test_decoder_state(self):
    # Check that the decoder state survives being moved to a fresh
    # decoder at every split position of the encoded sample text
    u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
    self.check_state_handling_decode(self.encoding,
                                     u, u.encode(self.encoding))
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEquals(codecs.escape_decode(""), ("", 0))
......@@ -1066,7 +1123,11 @@ broken_unicode_with_streams = [
broken_incremental_coders = broken_unicode_with_streams[:]
broken_incremental_coders = broken_unicode_with_streams + [
# The following encodings only support "strict" mode
only_strict_mode = [
......@@ -1091,7 +1152,7 @@ else:
class BasicUnicodeTest(unittest.TestCase):
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = u"abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings:
......@@ -1215,6 +1276,14 @@ class BasicUnicodeTest(unittest.TestCase):
table_type = type(cp1140.encoding_table)
self.assertEqual(table_type, table_type)
def test_decoder_state(self):
    # Run the getstate()/setstate() checks for every encoding whose
    # incremental coders are not listed as broken
    u = u"abc123"
    for encoding in all_unicode_encodings:
        if encoding in broken_incremental_coders:
            continue
        encoded = u.encode(encoding)
        self.check_state_handling_decode(encoding, u, encoded)
        self.check_state_handling_encode(encoding, u, encoded)
class BasicStrTest(unittest.TestCase):
def test_basics(self):
s = "abc123"
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment