Commit 489b56e0 authored by Marc-André Lemburg

This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark.
Subsequent BOM characters are no longer interpreted and removed.
UTF-16-LE and -BE pass through all BOM mark characters.

These changes should bring the UTF-16 codec more in line with what
the Unicode FAQ recommends with respect to BOM marks.
parent f52d27e5
...@@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( ...@@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
*byteorder == 0: native order *byteorder == 0: native order
*byteorder == 1: big endian *byteorder == 1: big endian
and then switches according to all BOM marks it finds in the input In native mode, the first two bytes of the stream are checked for a
data. BOM marks are not copied into the resulting Unicode string. BOM mark. If found, the BOM mark is analysed, the byte order
After completion, *byteorder is set to the current byte order at adjusted and the BOM skipped. In the other modes, no BOM mark
the end of input data. interpretation is done. After completion, *byteorder is set to the
current byte order at the end of input data.
If byteorder is NULL, the codec starts in native order mode. If byteorder is NULL, the codec starts in native order mode.
......
...@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, ...@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
if (byteorder) if (byteorder)
bo = *byteorder; bo = *byteorder;
while (q < e) { /* Check for BOM marks (U+FEFF) in the input and adjust current
register Py_UNICODE ch = *q++; byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
/* Check for BOM marks (U+FEFF) in the input and adjust stream as-is (giving a ZWNBSP character). */
current byte order setting accordingly. Swap input if (bo == 0) {
bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
!) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN #ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (ch == 0xFEFF) { if (*q == 0xFEFF) {
q++;
bo = -1; bo = -1;
continue; } else if (*q == 0xFFFE) {
} else if (ch == 0xFFFE) { q++;
bo = 1; bo = 1;
continue;
} }
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else #else
if (ch == 0xFEFF) { if (*q == 0xFEFF) {
q++;
bo = 1; bo = 1;
continue; } else if (*q == 0xFFFE) {
} else if (ch == 0xFFFE) { q++;
bo = -1; bo = -1;
continue;
} }
#endif
}
while (q < e) {
register Py_UNICODE ch = *q++;
/* Swap input bytes if needed. (This assumes
sizeof(Py_UNICODE) == 2 !) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
if (bo == -1) if (bo == -1)
ch = (ch >> 8) | (ch << 8); ch = (ch >> 8) | (ch << 8);
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment