Commit 772747b3 authored by Tim Peters

SF patch #438013 Remove 2-byte Py_UCS2 assumptions

Removed all instances of Py_UCS2 from the codebase, and so also (I hope)
the last remaining reliance on the platform having an integral type
with exactly 16 bits.
PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write
one byte at a time.
parent ab9ba27d
...@@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4; ...@@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4;
typedef unsigned long Py_UCS4; typedef unsigned long Py_UCS4;
#endif #endif
#if SIZEOF_SHORT == 2
typedef unsigned short Py_UCS2;
#else
#error Cannot find a two-byte type
#endif
typedef PY_UNICODE_TYPE Py_UNICODE; typedef PY_UNICODE_TYPE Py_UNICODE;
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */ /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
......
...@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) ...@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
/* --- UTF-16 Codec ------------------------------------------------------- */ /* --- UTF-16 Codec ------------------------------------------------------- */
static static
int utf16_decoding_error(const Py_UCS2 **source, int utf16_decoding_error(Py_UNICODE **dest,
Py_UNICODE **dest,
const char *errors, const char *errors,
const char *details) const char *details)
{ {
...@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source, ...@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
} }
} }
PyObject *PyUnicode_DecodeUTF16(const char *s, PyObject *
int size, PyUnicode_DecodeUTF16(const char *s,
const char *errors, int size,
int *byteorder) const char *errors,
int *byteorder)
{ {
PyUnicodeObject *unicode; PyUnicodeObject *unicode;
Py_UNICODE *p; Py_UNICODE *p;
const Py_UCS2 *q, *e; const unsigned char *q, *e;
int bo = 0; int bo = 0; /* assume native ordering by default */
const char *errmsg = ""; const char *errmsg = "";
/* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
/* size should be an even number */ /* size should be an even number */
if (size % sizeof(Py_UCS2) != 0) { if (size & 1) {
if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) if (utf16_decoding_error(NULL, errors, "truncated data"))
return NULL; return NULL;
/* The remaining input chars are ignored if we fall through --size; /* else ignore the oddball byte */
here... */
} }
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
...@@ -1004,48 +1009,54 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, ...@@ -1004,48 +1009,54 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
/* Unpack UTF-16 encoded data */ /* Unpack UTF-16 encoded data */
p = unicode->str; p = unicode->str;
q = (Py_UCS2 *)s; q = (unsigned char *)s;
e = q + (size / sizeof(Py_UCS2)); e = q + size;
if (byteorder) if (byteorder)
bo = *byteorder; bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current /* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */ stream as-is (giving a ZWNBSP character). */
if (bo == 0) { if (bo == 0) {
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN #ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (*q == 0xFEFF) { if (bom == 0xFEFF) {
q++; q += 2;
bo = -1; bo = -1;
} else if (*q == 0xFFFE) { }
q++; else if (bom == 0xFFFE) {
q += 2;
bo = 1; bo = 1;
} }
#else #else
if (*q == 0xFEFF) { if (bom == 0xFEFF) {
q++; q += 2;
bo = 1; bo = 1;
} else if (*q == 0xFFFE) { }
q++; else if (bom == 0xFFFE) {
q += 2;
bo = -1; bo = -1;
} }
#endif #endif
} }
if (bo == -1) {
/* force LE */
ihi = 1;
ilo = 0;
}
else if (bo == 1) {
/* force BE */
ihi = 0;
ilo = 1;
}
while (q < e) { while (q < e) {
register Py_UCS2 ch = *q++; Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
q += 2;
/* Swap input bytes if needed. (This assumes
sizeof(Py_UNICODE) == 2 !) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif
if (ch < 0xD800 || ch > 0xDFFF) { if (ch < 0xD800 || ch > 0xDFFF) {
*p++ = ch; *p++ = ch;
continue; continue;
...@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, ...@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
goto utf16Error; goto utf16Error;
} }
if (0xD800 <= ch && ch <= 0xDBFF) { if (0xD800 <= ch && ch <= 0xDBFF) {
Py_UCS2 ch2 = *q++; Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN q += 2;
if (bo == 1)
ch2 = (ch2 >> 8) | (ch2 << 8);
#else
if (bo == -1)
ch2 = (ch2 >> 8) | (ch2 << 8);
#endif
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE #ifndef Py_UNICODE_WIDE
*p++ = ch; *p++ = ch;
...@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, ...@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
/* Fall through to report the error */ /* Fall through to report the error */
utf16Error: utf16Error:
if (utf16_decoding_error(&q, &p, errors, errmsg)) if (utf16_decoding_error(&p, errors, errmsg))
goto onError; goto onError;
} }
...@@ -1102,58 +1107,67 @@ onError: ...@@ -1102,58 +1107,67 @@ onError:
return NULL; return NULL;
} }
#undef UTF16_ERROR PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, int size,
int size, const char *errors,
const char *errors, int byteorder)
int byteorder)
{ {
PyObject *v; PyObject *v;
Py_UCS2 *p; unsigned char *p;
char *q; int i, pairs;
int i, pairs, doswap = 1; /* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int ihi = 1, ilo = 0;
#else
int ihi = 0, ilo = 1;
#endif
#define STORECHAR(CH) \
do { \
p[ihi] = ((CH) >> 8) & 0xff; \
p[ilo] = (CH) & 0xff; \
p += 2; \
} while(0)
for (i = pairs = 0; i < size; i++) for (i = pairs = 0; i < size; i++)
if (s[i] >= 0x10000) if (s[i] >= 0x10000)
pairs++; pairs++;
v = PyString_FromStringAndSize(NULL, v = PyString_FromStringAndSize(NULL,
sizeof(Py_UCS2) * (size + pairs + (byteorder == 0))); 2 * (size + pairs + (byteorder == 0)));
if (v == NULL) if (v == NULL)
return NULL; return NULL;
q = PyString_AS_STRING(v); p = (unsigned char *)PyString_AS_STRING(v);
p = (Py_UCS2 *)q;
if (byteorder == 0) if (byteorder == 0)
*p++ = 0xFEFF; STORECHAR(0xFEFF);
if (size == 0) if (size == 0)
return v; return v;
if (byteorder == 0 ||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN if (byteorder == -1) {
byteorder == -1 /* force LE */
#else ihi = 1;
byteorder == 1 ilo = 0;
#endif }
) else if (byteorder == 1) {
doswap = 0; /* force BE */
ihi = 0;
ilo = 1;
}
while (size-- > 0) { while (size-- > 0) {
Py_UNICODE ch = *s++; Py_UNICODE ch = *s++;
Py_UNICODE ch2 = 0; Py_UNICODE ch2 = 0;
if (ch >= 0x10000) { if (ch >= 0x10000) {
ch2 = 0xDC00|((ch-0x10000) & 0x3FF); ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
ch = 0xD800|((ch-0x10000)>>10); ch = 0xD800 | ((ch-0x10000) >> 10);
}
if (doswap){
*p++ = (ch >> 8) | (ch << 8);
if (ch2)
*p++ = (ch2 >> 8) | (ch2 << 8);
}else{
*p++ = ch;
if(ch2)
*p++ = ch2;
} }
STORECHAR(ch);
if (ch2)
STORECHAR(ch2);
} }
return v; return v;
#undef STORECHAR
} }
PyObject *PyUnicode_AsUTF16String(PyObject *unicode) PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment