SF patch #438013: Remove 2-byte Py_UCS2 assumptions

Removed all instances of Py_UCS2 from the codebase and, with them (I hope), the last remaining reliance on the platform having an integral type with exactly 16 bits. PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write one byte at a time.

SF patch #438013: Remove 2-byte Py_UCS2 assumptions
Removed all instances of Py_UCS2 from the codebase and, with them (I hope), the last remaining reliance on the platform having an integral type with exactly 16 bits. PyUnicode_DecodeUTF16() and PyUnicode_EncodeUTF16() now read and write one byte at a time.
772747b3 · Tim Peters · ab9ba27d · 772747b3 · 772747b3
Commit 772747b3 authored Aug 09, 2001 by Tim Peters
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 82 deletions

Include/unicodeobject.h Include/unicodeobject.h +0 -6

Objects/unicodeobject.c Objects/unicodeobject.c +90 -76

No files found.
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4;
 typedef unsigned long Py_UCS4; 
 #endif
-#if SIZEOF_SHORT == 2
-typedef unsigned short Py_UCS2;
-#else
-#error Cannot find a two-byte type
-#endif 
 typedef PY_UNICODE_TYPE Py_UNICODE;
 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 /* --- UTF-16 Codec ------------------------------------------------------- */
 static
-int utf16_decoding_error(const Py_UCS2 **source,
+int utf16_decoding_error(Py_UNICODE **dest,
-			 Py_UNICODE **dest,
 			 const char *errors,
 			 const char *details) 
 {
@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
    }
 }
-PyObject *PyUnicode_DecodeUTF16(const char *s,
+PyObject *
-				int size,
+PyUnicode_DecodeUTF16(const char *s,
-				const char *errors,
+		      int size,
-				int *byteorder)
+		      const char *errors,
+		      int *byteorder)
 {
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
-    const Py_UCS2 *q, *e;
+    const unsigned char *q, *e;
-    int bo = 0;
+    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
+    /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif
    /* size should be an even number */
-    if (size % sizeof(Py_UCS2) != 0) {
+    if (size & 1) {
-	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
+        if (utf16_decoding_error(NULL, errors, "truncated data"))
-	    return NULL;
+            return NULL;
-	/* The remaining input chars are ignored if we fall through
+        --size;  /* else ignore the oddball byte */
-           here... */
    }
    /* Note: size will always be longer than the resulting Unicode
@@ -1004,48 +1009,54 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
    /* Unpack UTF-16 encoded data */
    p = unicode->str;
-    q = (Py_UCS2 *)s;
+    q = (unsigned char *)s;
-    e = q + (size / sizeof(Py_UCS2));
+    e = q + size;
    if (byteorder)
-	bo = *byteorder;
+        bo = *byteorder;
    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
+        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (*q == 0xFEFF) {
+	if (bom == 0xFEFF) {
-	    q++;
+	    q += 2;
 	    bo = -1;
-	} else if (*q == 0xFFFE) {
+	}
-	    q++;
+        else if (bom == 0xFFFE) {
+	    q += 2;
 	    bo = 1;
 	}
 #else    
-	if (*q == 0xFEFF) {
+	if (bom == 0xFEFF) {
-	    q++;
+	    q += 2;
 	    bo = 1;
-	} else if (*q == 0xFFFE) {
+	}
-	    q++;
+        else if (bom == 0xFFFE) {
+	    q += 2;
 	    bo = -1;
 	}
 #endif
    }
+    if (bo == -1) {
+        /* force LE */
+        ihi = 1;
+        ilo = 0;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
    while (q < e) {
-	register Py_UCS2 ch = *q++;
+	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+	q += 2;
-	/* Swap input bytes if needed. (This assumes
-	   sizeof(Py_UNICODE) == 2 !) */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (bo == 1)
-	    ch = (ch >> 8) | (ch << 8);
-#else    
-	if (bo == -1)
-	    ch = (ch >> 8) | (ch << 8);
-#endif
 	if (ch < 0xD800 || ch > 0xDFFF) {
 	    *p++ = ch;
 	    continue;
@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	    goto utf16Error;
 	}
 	if (0xD800 <= ch && ch <= 0xDBFF) {
-	    Py_UCS2 ch2 = *q++;
+	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+	    q += 2;
-	    if (bo == 1)
-		    ch2 = (ch2 >> 8) | (ch2 << 8);
-#else    
-	    if (bo == -1)
-		    ch2 = (ch2 >> 8) | (ch2 << 8);
-#endif
 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 #ifndef Py_UNICODE_WIDE
 		*p++ = ch;
@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
 	/* Fall through to report the error */
    utf16Error:
-	if (utf16_decoding_error(&q, &p, errors, errmsg))
+	if (utf16_decoding_error(&p, errors, errmsg))
 	    goto onError;
    }
@@ -1102,58 +1107,67 @@ onError:
    return NULL;
 }
-#undef UTF16_ERROR
+PyObject *
+PyUnicode_EncodeUTF16(const Py_UNICODE *s,
-PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
+		      int size,
-				int size,
+		      const char *errors,
-				const char *errors,
+		      int byteorder)
-				int byteorder)
 {
    PyObject *v;
-    Py_UCS2 *p;
+    unsigned char *p;
-    char *q;
+    int i, pairs;
-    int i, pairs, doswap = 1;
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif
+#define STORECHAR(CH)                   \
+    do {                                \
+        p[ihi] = ((CH) >> 8) & 0xff;    \
+        p[ilo] = (CH) & 0xff;           \
+        p += 2;                         \
+    } while(0)
    for (i = pairs = 0; i < size; i++)
 	if (s[i] >= 0x10000)
 	    pairs++;
    v = PyString_FromStringAndSize(NULL, 
-		  sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
+		  2 * (size + pairs + (byteorder == 0)));
    if (v == NULL)
        return NULL;
-    q = PyString_AS_STRING(v);
+    p = (unsigned char *)PyString_AS_STRING(v);
-    p = (Py_UCS2 *)q;
    if (byteorder == 0)
-	*p++ = 0xFEFF;
+	STORECHAR(0xFEFF);
    if (size == 0)
        return v;
-    if (byteorder == 0 ||
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN	
+    if (byteorder == -1) {
-	byteorder == -1
+        /* force LE */
-#else
+        ihi = 1;
-	byteorder == 1
+        ilo = 0;
-#endif
+    }
-	)
+    else if (byteorder == 1) {
-	doswap = 0;
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
    while (size-- > 0) {
 	Py_UNICODE ch = *s++;
 	Py_UNICODE ch2 = 0;
 	if (ch >= 0x10000) {
-	    ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
+	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
-	    ch  = 0xD800|((ch-0x10000)>>10);
+	    ch  = 0xD800 | ((ch-0x10000) >> 10);
-	}
-	if (doswap){
-	    *p++ = (ch >> 8) | (ch << 8);
-	    if (ch2)
-		*p++ = (ch2 >> 8) | (ch2 << 8);
-	}else{
-	    *p++ = ch;
-	    if(ch2)
-		*p++ = ch2;
 	}
+        STORECHAR(ch);
+        if (ch2)
+            STORECHAR(ch2);
    }
    return v;
+#undef STORECHAR
 }
 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)