Commit e450185b authored by Antoine Pitrou

Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io library.

This means, for example, that opening a UTF-16 text file in
append mode doesn't add a BOM at the end of the file if the file isn't
empty.
parent b565577a
...@@ -1436,6 +1436,15 @@ class TextIOWrapper(TextIOBase): ...@@ -1436,6 +1436,15 @@ class TextIOWrapper(TextIOBase):
self._snapshot = None # info for reconstructing decoder state self._snapshot = None # info for reconstructing decoder state
self._seekable = self._telling = self.buffer.seekable() self._seekable = self._telling = self.buffer.seekable()
if self._seekable and self.writable():
position = self.buffer.tell()
if position != 0:
try:
self._get_encoder().setstate(0)
except LookupError:
# Sometimes the encoder doesn't exist
pass
# self._snapshot is either None, or a tuple (dec_flags, next_input) # self._snapshot is either None, or a tuple (dec_flags, next_input)
# where dec_flags is the second (integer) item of the decoder state # where dec_flags is the second (integer) item of the decoder state
# and next_input is the chunk of input bytes that comes next after the # and next_input is the chunk of input bytes that comes next after the
...@@ -1741,6 +1750,17 @@ class TextIOWrapper(TextIOBase): ...@@ -1741,6 +1750,17 @@ class TextIOWrapper(TextIOBase):
raise IOError("can't restore logical file position") raise IOError("can't restore logical file position")
self._decoded_chars_used = chars_to_skip self._decoded_chars_used = chars_to_skip
# Finally, reset the encoder (merely useful for proper BOM handling)
try:
encoder = self._encoder or self._get_encoder()
except LookupError:
# Sometimes the encoder doesn't exist
pass
else:
if cookie != 0:
encoder.setstate(0)
else:
encoder.reset()
return cookie return cookie
def read(self, n=None): def read(self, n=None):
......
...@@ -1963,6 +1963,37 @@ class TextIOWrapperTest(unittest.TestCase): ...@@ -1963,6 +1963,37 @@ class TextIOWrapperTest(unittest.TestCase):
self.assertEqual(buffer.seekable(), txt.seekable()) self.assertEqual(buffer.seekable(), txt.seekable())
def test_append_bom(self):
    # The BOM is not written again when appending to a non-empty file.
    # For every BOM-emitting codec: create the file with 'aaa', append
    # 'xxx', and compare the raw bytes against one encoding of the
    # concatenated text.
    filename = support.TESTFN
    for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
        with self.open(filename, 'w', encoding=charset) as f:
            f.write('aaa')
        with self.open(filename, 'rb') as f:
            self.assertEqual(f.read(), 'aaa'.encode(charset))
        with self.open(filename, 'a', encoding=charset) as f:
            f.write('xxx')
        with self.open(filename, 'rb') as f:
            # Encoding the whole text in one call yields a single
            # leading BOM, which is what the file must contain.
            self.assertEqual(f.read(), 'aaaxxx'.encode(charset))
def test_seek_bom(self):
    # Same test, but when seeking manually.
    # 'pos' is the text position just past the initial 'aaa'; writing
    # there must not emit a BOM, while writing after seek(0) rewrites
    # the start of the file.
    filename = support.TESTFN
    for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
        with self.open(filename, 'w', encoding=charset) as f:
            f.write('aaa')
            pos = f.tell()
        with self.open(filename, 'r+', encoding=charset) as f:
            f.seek(pos)
            f.write('zzz')
            f.seek(0)
            f.write('bbb')
        with self.open(filename, 'rb') as f:
            # Expected content: 'bbb' overwrote 'aaa', followed by 'zzz',
            # byte-identical to a single encoding of 'bbbzzz'.
            self.assertEqual(f.read(), 'bbbzzz'.encode(charset))
class CTextIOWrapperTest(TextIOWrapperTest): class CTextIOWrapperTest(TextIOWrapperTest):
def test_initialization(self): def test_initialization(self):
......
...@@ -23,6 +23,11 @@ Core and Builtins ...@@ -23,6 +23,11 @@ Core and Builtins
Library Library
------- -------
- Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io
library. This means, for example, that opening a UTF-16 text file in
append mode doesn't add a BOM at the end of the file if the file isn't
empty.
- Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source' - Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source'
file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix file is a binary. Patch by Brodie Rao, tests by Daniel Diniz. This fix
corrects a pydoc regression. corrects a pydoc regression.
......
...@@ -41,6 +41,7 @@ PyObject *_PyIO_str_readline; ...@@ -41,6 +41,7 @@ PyObject *_PyIO_str_readline;
PyObject *_PyIO_str_reset; PyObject *_PyIO_str_reset;
PyObject *_PyIO_str_seek; PyObject *_PyIO_str_seek;
PyObject *_PyIO_str_seekable; PyObject *_PyIO_str_seekable;
PyObject *_PyIO_str_setstate;
PyObject *_PyIO_str_tell; PyObject *_PyIO_str_tell;
PyObject *_PyIO_str_truncate; PyObject *_PyIO_str_truncate;
PyObject *_PyIO_str_writable; PyObject *_PyIO_str_writable;
...@@ -48,6 +49,7 @@ PyObject *_PyIO_str_write; ...@@ -48,6 +49,7 @@ PyObject *_PyIO_str_write;
PyObject *_PyIO_empty_str; PyObject *_PyIO_empty_str;
PyObject *_PyIO_empty_bytes; PyObject *_PyIO_empty_bytes;
PyObject *_PyIO_zero;
PyDoc_STRVAR(module_doc, PyDoc_STRVAR(module_doc,
...@@ -734,6 +736,8 @@ PyInit__io(void) ...@@ -734,6 +736,8 @@ PyInit__io(void)
goto fail; goto fail;
if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable"))) if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
goto fail; goto fail;
if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
goto fail;
if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell"))) if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
goto fail; goto fail;
if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate"))) if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
...@@ -747,6 +751,8 @@ PyInit__io(void) ...@@ -747,6 +751,8 @@ PyInit__io(void)
goto fail; goto fail;
if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0))) if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
goto fail; goto fail;
if (!(_PyIO_zero = PyLong_FromLong(0L)))
goto fail;
state->initialized = 1; state->initialized = 1;
......
...@@ -141,6 +141,7 @@ extern PyObject *_PyIO_str_readline; ...@@ -141,6 +141,7 @@ extern PyObject *_PyIO_str_readline;
extern PyObject *_PyIO_str_reset; extern PyObject *_PyIO_str_reset;
extern PyObject *_PyIO_str_seek; extern PyObject *_PyIO_str_seek;
extern PyObject *_PyIO_str_seekable; extern PyObject *_PyIO_str_seekable;
extern PyObject *_PyIO_str_setstate;
extern PyObject *_PyIO_str_tell; extern PyObject *_PyIO_str_tell;
extern PyObject *_PyIO_str_truncate; extern PyObject *_PyIO_str_truncate;
extern PyObject *_PyIO_str_writable; extern PyObject *_PyIO_str_writable;
...@@ -148,3 +149,4 @@ extern PyObject *_PyIO_str_write; ...@@ -148,3 +149,4 @@ extern PyObject *_PyIO_str_write;
extern PyObject *_PyIO_empty_str; extern PyObject *_PyIO_empty_str;
extern PyObject *_PyIO_empty_bytes; extern PyObject *_PyIO_empty_bytes;
extern PyObject *_PyIO_zero;
...@@ -647,6 +647,8 @@ typedef struct ...@@ -647,6 +647,8 @@ typedef struct
char telling; char telling;
/* Specialized encoding func (see below) */ /* Specialized encoding func (see below) */
encodefunc_t encodefunc; encodefunc_t encodefunc;
/* Whether or not it's the start of the stream */
char encoding_start_of_stream;
/* Reads and writes are internally buffered in order to speed things up. /* Reads and writes are internally buffered in order to speed things up.
However, any read will first flush the write buffer if it isn't empty. However, any read will first flush the write buffer if it isn't empty.
...@@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text) ...@@ -707,21 +709,50 @@ utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
static PyObject * static PyObject *
utf16_encode(PyTextIOWrapperObject *self, PyObject *text) utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
{ {
PyObject *res; if (!self->encoding_start_of_stream) {
res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text), /* Skip the BOM and use native byte ordering */
PyUnicode_GET_SIZE(text),
PyBytes_AS_STRING(self->errors), 0);
if (res == NULL)
return NULL;
/* Next writes will skip the BOM and use native byte ordering */
#if defined(WORDS_BIGENDIAN) #if defined(WORDS_BIGENDIAN)
self->encodefunc = (encodefunc_t) utf16be_encode; return utf16be_encode(self, text);
#else #else
self->encodefunc = (encodefunc_t) utf16le_encode; return utf16le_encode(self, text);
#endif #endif
return res; }
return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
PyUnicode_GET_SIZE(text),
PyBytes_AS_STRING(self->errors), 0);
} }
static PyObject *
utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
{
    /* Encode `text` as UTF-32 using the wrapper's error handler.
       A byteorder argument of 1 selects big-endian output. */
    const Py_UNICODE *chars = PyUnicode_AS_UNICODE(text);
    Py_ssize_t length = PyUnicode_GET_SIZE(text);
    const char *errors = PyBytes_AS_STRING(self->errors);

    return PyUnicode_EncodeUTF32(chars, length, errors, 1);
}
static PyObject *
utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
{
    /* Encode `text` as UTF-32 using the wrapper's error handler.
       A byteorder argument of -1 selects little-endian output. */
    const Py_UNICODE *chars = PyUnicode_AS_UNICODE(text);
    Py_ssize_t length = PyUnicode_GET_SIZE(text);
    const char *errors = PyBytes_AS_STRING(self->errors);

    return PyUnicode_EncodeUTF32(chars, length, errors, -1);
}
static PyObject *
utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
{
    /* Generic "utf-32" encoder: the BOM-producing call is used only while
       the wrapper is flagged as being at the start of the stream. */
    if (self->encoding_start_of_stream) {
        /* byteorder 0: let the codec pick native order and emit the BOM */
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
                                     PyUnicode_GET_SIZE(text),
                                     PyBytes_AS_STRING(self->errors), 0);
    }

    /* Not at the start: skip the BOM and use native byte ordering */
#if defined(WORDS_BIGENDIAN)
    return utf32be_encode(self, text);
#else
    return utf32le_encode(self, text);
#endif
}
static PyObject * static PyObject *
utf8_encode(PyTextIOWrapperObject *self, PyObject *text) utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
...@@ -749,10 +780,13 @@ typedef struct { ...@@ -749,10 +780,13 @@ typedef struct {
static encodefuncentry encodefuncs[] = { static encodefuncentry encodefuncs[] = {
{"ascii", (encodefunc_t) ascii_encode}, {"ascii", (encodefunc_t) ascii_encode},
{"iso8859-1", (encodefunc_t) latin1_encode}, {"iso8859-1", (encodefunc_t) latin1_encode},
{"utf-8", (encodefunc_t) utf8_encode},
{"utf-16-be", (encodefunc_t) utf16be_encode}, {"utf-16-be", (encodefunc_t) utf16be_encode},
{"utf-16-le", (encodefunc_t) utf16le_encode}, {"utf-16-le", (encodefunc_t) utf16le_encode},
{"utf-16", (encodefunc_t) utf16_encode}, {"utf-16", (encodefunc_t) utf16_encode},
{"utf-8", (encodefunc_t) utf8_encode}, {"utf-32-be", (encodefunc_t) utf32be_encode},
{"utf-32-le", (encodefunc_t) utf32le_encode},
{"utf-32", (encodefunc_t) utf32_encode},
{NULL, NULL} {NULL, NULL}
}; };
...@@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds) ...@@ -978,6 +1012,33 @@ TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
self->seekable = self->telling = PyObject_IsTrue(res); self->seekable = self->telling = PyObject_IsTrue(res);
Py_DECREF(res); Py_DECREF(res);
self->encoding_start_of_stream = 0;
if (self->seekable && self->encoder) {
PyObject *cookieObj;
int cmp;
self->encoding_start_of_stream = 1;
cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
if (cookieObj == NULL)
goto error;
cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
Py_DECREF(cookieObj);
if (cmp < 0) {
goto error;
}
if (cmp == 0) {
self->encoding_start_of_stream = 0;
res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
_PyIO_zero, NULL);
if (res == NULL)
goto error;
Py_DECREF(res);
}
}
self->ok = 1; self->ok = 1;
return 0; return 0;
...@@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args) ...@@ -1192,8 +1253,10 @@ TextIOWrapper_write(PyTextIOWrapperObject *self, PyObject *args)
needflush = 1; needflush = 1;
/* XXX What if we were just reading? */ /* XXX What if we were just reading? */
if (self->encodefunc != NULL) if (self->encodefunc != NULL) {
b = (*self->encodefunc)((PyObject *) self, text); b = (*self->encodefunc)((PyObject *) self, text);
self->encoding_start_of_stream = 0;
}
else else
b = PyObject_CallMethodObjArgs(self->encoder, b = PyObject_CallMethodObjArgs(self->encoder,
_PyIO_str_encode, text, NULL); _PyIO_str_encode, text, NULL);
...@@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self, ...@@ -1847,24 +1910,38 @@ _TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
return 0; return 0;
} }
static int
_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
                                CookieStruct *cookie)
{
    /* Bring the encoder in line with a seek target described by `cookie`.
       Returns 0 on success, -1 if the encoder method call failed.

       A cookie with start_pos == 0 and dec_flags == 0 is treated as the
       start of the stream: the encoder is reset().  Any other cookie
       results in setstate(0) -- presumably the "BOM already written"
       state; confirm against the codecs documentation. */
    PyObject *result;
    int at_start = (cookie->start_pos == 0 && cookie->dec_flags == 0);

    if (at_start)
        result = PyObject_CallMethodObjArgs(self->encoder,
                                            _PyIO_str_reset, NULL);
    else
        result = PyObject_CallMethodObjArgs(self->encoder,
                                            _PyIO_str_setstate,
                                            _PyIO_zero, NULL);

    /* The flag is updated whether or not the call succeeded. */
    self->encoding_start_of_stream = at_start;

    if (result == NULL)
        return -1;
    Py_DECREF(result);
    return 0;
}
static PyObject * static PyObject *
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
{ {
PyObject *cookieObj, *posobj; PyObject *cookieObj, *posobj;
CookieStruct cookie; CookieStruct cookie;
int whence = 0; int whence = 0;
static PyObject *zero = NULL;
PyObject *res; PyObject *res;
int cmp; int cmp;
CHECK_INITIALIZED(self); CHECK_INITIALIZED(self);
if (zero == NULL) {
zero = PyLong_FromLong(0L);
if (zero == NULL)
return NULL;
}
if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence)) if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
return NULL; return NULL;
CHECK_CLOSED(self); CHECK_CLOSED(self);
...@@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) ...@@ -1879,7 +1956,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
if (whence == 1) { if (whence == 1) {
/* seek relative to current position */ /* seek relative to current position */
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ); cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
if (cmp < 0) if (cmp < 0)
goto fail; goto fail;
...@@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) ...@@ -1900,7 +1977,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
else if (whence == 2) { else if (whence == 2) {
/* seek relative to end of file */ /* seek relative to end of file */
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ); cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
if (cmp < 0) if (cmp < 0)
goto fail; goto fail;
...@@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) ...@@ -1934,7 +2011,7 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
goto fail; goto fail;
} }
cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT); cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
if (cmp < 0) if (cmp < 0)
goto fail; goto fail;
...@@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args) ...@@ -2013,6 +2090,11 @@ TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
goto fail; goto fail;
} }
/* Finally, reset the encoder (merely useful for proper BOM handling) */
if (self->encoder) {
if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
goto fail;
}
return cookieObj; return cookieObj;
fail: fail:
Py_XDECREF(cookieObj); Py_XDECREF(cookieObj);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment