Commit 7df55dad authored by Victor Stinner

Issue #6268: More bugfixes about BOM, UTF-16 and UTF-32

 * Fix seek() method of codecs.open(), don't write the BOM twice after seek(0)
 * Fix reset() method of codecs, UTF-16, UTF-32 and StreamWriter classes
 * test_codecs: use "w+" mode instead of "wt+". "t" mode is not supported by
   Solaris or Windows, but does it really exist? I found it in the issue.
parent 54d2898e
...@@ -370,6 +370,11 @@ class StreamWriter(Codec): ...@@ -370,6 +370,11 @@ class StreamWriter(Codec):
""" """
pass pass
def seek(self, offset, whence=0):
self.stream.seek(offset, whence)
if whence == 0 and offset == 0:
self.reset()
def __getattr__(self, name, def __getattr__(self, name,
getattr=getattr): getattr=getattr):
...@@ -601,8 +606,8 @@ class StreamReader(Codec): ...@@ -601,8 +606,8 @@ class StreamReader(Codec):
Resets the codec buffers used for keeping state. Resets the codec buffers used for keeping state.
""" """
self.reset()
self.stream.seek(offset, whence) self.stream.seek(offset, whence)
self.reset()
def next(self): def next(self):
...@@ -695,8 +700,10 @@ class StreamReaderWriter: ...@@ -695,8 +700,10 @@ class StreamReaderWriter:
self.writer.reset() self.writer.reset()
def seek(self, offset, whence=0): def seek(self, offset, whence=0):
self.reader.seek(offset, whence) self.stream.seek(offset, whence)
self.writer.seek(offset, whence) self.reader.reset()
if whence == 0 and offset == 0:
self.writer.reset()
def __getattr__(self, name, def __getattr__(self, name,
getattr=getattr): getattr=getattr):
......
...@@ -58,17 +58,23 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder): ...@@ -58,17 +58,23 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
class StreamWriter(codecs.StreamWriter): class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'): def __init__(self, stream, errors='strict'):
self.bom_written = False
codecs.StreamWriter.__init__(self, stream, errors) codecs.StreamWriter.__init__(self, stream, errors)
self.encoder = None
def reset(self):
codecs.StreamWriter.reset(self)
self.encoder = None
def encode(self, input, errors='strict'): def encode(self, input, errors='strict'):
self.bom_written = True if self.encoder is None:
result = codecs.utf_16_encode(input, errors) result = codecs.utf_16_encode(input, errors)
if sys.byteorder == 'little': if sys.byteorder == 'little':
self.encode = codecs.utf_16_le_encode self.encoder = codecs.utf_16_le_encode
else:
self.encoder = codecs.utf_16_be_encode
return result
else: else:
self.encode = codecs.utf_16_be_encode return self.encoder(input, errors)
return result
class StreamReader(codecs.StreamReader): class StreamReader(codecs.StreamReader):
......
...@@ -98,17 +98,23 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder): ...@@ -98,17 +98,23 @@ class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
class StreamWriter(codecs.StreamWriter): class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'): def __init__(self, stream, errors='strict'):
self.bom_written = False self.encoder = None
codecs.StreamWriter.__init__(self, stream, errors) codecs.StreamWriter.__init__(self, stream, errors)
def reset(self):
codecs.StreamWriter.reset(self)
self.encoder = None
def encode(self, input, errors='strict'): def encode(self, input, errors='strict'):
self.bom_written = True if self.encoder is None:
result = codecs.utf_32_encode(input, errors) result = codecs.utf_32_encode(input, errors)
if sys.byteorder == 'little': if sys.byteorder == 'little':
self.encode = codecs.utf_32_le_encode self.encoder = codecs.utf_32_le_encode
else:
self.encoder = codecs.utf_32_be_encode
return result
else: else:
self.encode = codecs.utf_32_be_encode return self.encoder(input, errors)
return result
class StreamReader(codecs.StreamReader): class StreamReader(codecs.StreamReader):
......
...@@ -1498,7 +1498,7 @@ class WithStmtTest(unittest.TestCase): ...@@ -1498,7 +1498,7 @@ class WithStmtTest(unittest.TestCase):
class BomTest(unittest.TestCase): class BomTest(unittest.TestCase):
def test_seek0(self): def test_seek0(self):
data = "1234567890" data = u"1234567890"
tests = ("utf-16", tests = ("utf-16",
"utf-16-le", "utf-16-le",
"utf-16-be", "utf-16-be",
...@@ -1506,8 +1506,8 @@ class BomTest(unittest.TestCase): ...@@ -1506,8 +1506,8 @@ class BomTest(unittest.TestCase):
"utf-32-le", "utf-32-le",
"utf-32-be") "utf-32-be")
for encoding in tests: for encoding in tests:
with codecs.open('foo', 'wt+', encoding=encoding) as f: # Check if the BOM is written only once
# Check if the BOM is written only once with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
f.write(data) f.write(data)
f.write(data) f.write(data)
f.seek(0) f.seek(0)
...@@ -1515,6 +1515,42 @@ class BomTest(unittest.TestCase): ...@@ -1515,6 +1515,42 @@ class BomTest(unittest.TestCase):
f.seek(0) f.seek(0)
self.assertEquals(f.read(), data * 2) self.assertEquals(f.read(), data * 2)
# Check that the BOM is written after a seek(0)
with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
f.write(data[0])
self.assertNotEquals(f.tell(), 0)
f.seek(0)
f.write(data)
f.seek(0)
self.assertEquals(f.read(), data)
# (StreamWriter) Check that the BOM is written after a seek(0)
with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
f.writer.write(data[0])
self.assertNotEquals(f.writer.tell(), 0)
f.writer.seek(0)
f.writer.write(data)
f.seek(0)
self.assertEquals(f.read(), data)
# Check that the BOM is not written after a seek() at a position
# different than the start
with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
f.write(data)
f.seek(f.tell())
f.write(data)
f.seek(0)
self.assertEquals(f.read(), data * 2)
# (StreamWriter) Check that the BOM is not written after a seek()
# at a position different than the start
with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
f.writer.write(data)
f.writer.seek(f.writer.tell())
f.writer.write(data)
f.seek(0)
self.assertEquals(f.read(), data * 2)
def test_main(): def test_main():
test_support.run_unittest( test_support.run_unittest(
......
...@@ -31,8 +31,9 @@ Library ...@@ -31,8 +31,9 @@ Library
- Issue #3924: Ignore cookies with invalid "version" field in cookielib. - Issue #3924: Ignore cookies with invalid "version" field in cookielib.
- Issue #6268: Fix seek() method of codecs.open(), don't read the BOM twice - Issue #6268: Fix seek() method of codecs.open(), don't read or write the BOM
after seek(0) twice after seek(0). Fix also reset() method of codecs, UTF-16, UTF-32 and
StreamWriter classes.
- Issue #5640: Fix Shift-JIS incremental encoder for error handlers different - Issue #5640: Fix Shift-JIS incremental encoder for error handlers different
than strict than strict
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment