#5127: Even on narrow unicode builds, the C functions that access the Unicode

Database (Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others) now accept and return characters from the full Unicode range (Py_UCS4).

The differences visible from Python code are few:
- unicodedata.numeric(), unicodedata.decimal() and unicodedata.digit() now return the correct value for large code points
- repr() may consider more characters as printable.

#5127: Even on narrow unicode builds, the C functions that access the Unicode
Database (Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others) now accept and return characters from the full Unicode range (Py_UCS4).

The differences visible from Python code are few:
- unicodedata.numeric(), unicodedata.decimal() and unicodedata.digit() now return the correct value for large code points
- repr() may consider more characters as printable.
8d5b2896 · Amaury Forgeot d'Arc · 83123ede · 8d5b2896 · 8d5b2896 · 8d5b2896
Commit 8d5b2896 authored Aug 18, 2010 by Amaury Forgeot d'Arc
7 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -221,24 +221,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
 # define _PyUnicode_Init _PyUnicodeUCS2_Init
-# define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
-# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
-# define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
-# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
-# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
-# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
-# define _PyUnicode_IsPrintable _PyUnicodeUCS2_IsPrintable
-# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
-# define _PyUnicode_IsXidStart _PyUnicodeUCS2_IsXidStart
-# define _PyUnicode_IsXidContinue _PyUnicodeUCS2_IsXidContinue
-# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
-# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
-# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
-# define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
-# define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
-# define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
-# define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
-# define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase

 #else

@@ -322,24 +304,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
 # define _PyUnicode_Init _PyUnicodeUCS4_Init
-# define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
-# define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
-# define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
-# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
-# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
-# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
-# define _PyUnicode_IsPrintable _PyUnicodeUCS4_IsPrintable
-# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
-# define _PyUnicode_IsXidStart _PyUnicodeUCS4_IsXidStart
-# define _PyUnicode_IsXidContinue _PyUnicodeUCS4_IsXidContinue
-# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
-# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
-# define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
-# define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
-# define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
-# define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
-# define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
-# define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase


 #endif
@@ -351,7 +315,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
   configure Python using --with-wctype-functions.  This reduces the
   interpreter's code size. */

-#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
+#if defined(Py_UNICODE_WIDE) && defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

 #include <wctype.h>

@@ -1542,75 +1506,75 @@ PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
 */

 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
-    const Py_UNICODE ch         /* Unicode character */
+    const Py_UCS4 ch         /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
-    const Py_UNICODE ch         /* Unicode character */
+    const Py_UCS4 ch         /* Unicode character */
    );

-PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
-    Py_UNICODE ch       /* Unicode character */
+PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
+    Py_UCS4 ch       /* Unicode character */
    );

-PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
-    Py_UNICODE ch       /* Unicode character */
+PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
+    Py_UCS4 ch       /* Unicode character */
    );

-PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
-    Py_UNICODE ch       /* Unicode character */
+PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_ToDigit(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsDigit(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
-    Py_UNICODE ch       /* Unicode character */
+    Py_UCS4 ch       /* Unicode character */
    );

 PyAPI_FUNC(size_t) Py_UNICODE_strlen(

--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1353,6 +1353,10 @@ class UnicodeTest(string_tests.CommonTest,
        self.assertEqual(repr(s1()), '\\n')
        self.assertEqual(repr(s2()), '\\n')

+    def test_printable_repr(self):
+        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
+        self.assertEqual(repr('\U00011000'), "'\\U00011000'")     # nonprintable
+
    def test_expandtabs_overflows_gracefully(self):
        # This test only affects 32-bit platforms because expandtabs can only take
        # an int as the max value, not a 64-bit C long.  If expandtabs is changed

--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -294,6 +294,12 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)

+    def test_UCS4(self):
+        # unicodedata should work with code points outside the BMP
+        # even on a narrow Unicode build
+        self.assertEqual(self.db.category(u"\U0001012A"), "No")
+        self.assertEqual(self.db.numeric(u"\U0001012A"), 9000)
+
 def test_main():
    test.support.run_unittest(
        UnicodeMiscTest,

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,12 @@ What's New in Python 3.2 Alpha 2?
 Core and Builtins
 -----------------

+- Issue #5127: The C functions that access the Unicode Database now accept and
+  return characters from the full Unicode range, even on narrow unicode builds
+  (Py_UNICODE_TOLOWER, Py_UNICODE_ISDECIMAL, and others).  A visible difference
+  in Python is that unicodedata.numeric() now returns the correct value for
+  large code points, and repr() may consider more characters as printable.
+
 - Issue #9425: Create PyModule_GetFilenameObject() function to get the filename
  as a unicode object, instead of a byte string. Function needed to support
  unencodable filenames. Deprecate PyModule_GetFilename() in favor on the new

--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -26,9 +26,9 @@
 #define NUMERIC_MASK 0x1000

 typedef struct {
-    const Py_UNICODE upper;
-    const Py_UNICODE lower;
-    const Py_UNICODE title;
+    const Py_UCS4 upper;
+    const Py_UCS4 lower;
+    const Py_UCS4 title;
    const unsigned char decimal;
    const unsigned char digit;
    const unsigned short flags;
@@ -37,15 +37,13 @@ typedef struct {
 #include "unicodetype_db.h"

 static const _PyUnicode_TypeRecord *
-gettyperecord(Py_UNICODE code)
+gettyperecord(Py_UCS4 code)
 {
    int index;

-#ifdef Py_UNICODE_WIDE
    if (code >= 0x110000)
        index = 0;
    else
-#endif
    {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
@@ -57,7 +55,7 @@ gettyperecord(Py_UNICODE code)
 /* Returns the titlecase Unicode characters corresponding to ch or just
   ch if no titlecase mapping is known. */

-Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
+Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->title;
@@ -74,7 +72,7 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the category 'Lt', 0
   otherwise. */

-int _PyUnicode_IsTitlecase(Py_UNICODE ch)
+int _PyUnicode_IsTitlecase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -84,7 +82,7 @@ int _PyUnicode_IsTitlecase(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the XID_Start property, 0
   otherwise. */

-int _PyUnicode_IsXidStart(Py_UNICODE ch)
+int _PyUnicode_IsXidStart(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -94,7 +92,7 @@ int _PyUnicode_IsXidStart(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the XID_Continue property,
   0 otherwise. */

-int _PyUnicode_IsXidContinue(Py_UNICODE ch)
+int _PyUnicode_IsXidContinue(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -104,14 +102,14 @@ int _PyUnicode_IsXidContinue(Py_UNICODE ch)
 /* Returns the integer decimal (0-9) for Unicode characters having
   this property, -1 otherwise. */

-int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
+int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
 }

-int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
+int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
 {
    if (_PyUnicode_ToDecimalDigit(ch) < 0)
        return 0;
@@ -121,14 +119,14 @@ int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
 /* Returns the integer digit (0-9) for Unicode characters having
   this property, -1 otherwise. */

-int _PyUnicode_ToDigit(Py_UNICODE ch)
+int _PyUnicode_ToDigit(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

    return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
 }

-int _PyUnicode_IsDigit(Py_UNICODE ch)
+int _PyUnicode_IsDigit(Py_UCS4 ch)
 {
    if (_PyUnicode_ToDigit(ch) < 0)
        return 0;
@@ -138,7 +136,7 @@ int _PyUnicode_IsDigit(Py_UNICODE ch)
 /* Returns the numeric value as double for Unicode characters having
   this property, -1.0 otherwise. */

-int _PyUnicode_IsNumeric(Py_UNICODE ch)
+int _PyUnicode_IsNumeric(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -158,7 +156,7 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
      * Zs (Separator, Space) other than ASCII space('\x20').
 */
-int _PyUnicode_IsPrintable(Py_UNICODE ch)
+int _PyUnicode_IsPrintable(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -170,7 +168,7 @@ int _PyUnicode_IsPrintable(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the category 'Ll', 0
   otherwise. */

-int _PyUnicode_IsLowercase(Py_UNICODE ch)
+int _PyUnicode_IsLowercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -180,7 +178,7 @@ int _PyUnicode_IsLowercase(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the category 'Lu', 0
   otherwise. */

-int _PyUnicode_IsUppercase(Py_UNICODE ch)
+int _PyUnicode_IsUppercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -190,7 +188,7 @@ int _PyUnicode_IsUppercase(Py_UNICODE ch)
 /* Returns the uppercase Unicode characters corresponding to ch or just
   ch if no uppercase mapping is known. */

-Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
+Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->upper;
@@ -204,7 +202,7 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 /* Returns the lowercase Unicode characters corresponding to ch or just
   ch if no lowercase mapping is known. */

-Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
+Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->lower;
@@ -218,7 +216,7 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
   'Lo' or 'Lm',  0 otherwise. */

-int _PyUnicode_IsAlpha(Py_UNICODE ch)
+int _PyUnicode_IsAlpha(Py_UCS4 ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);

@@ -230,27 +228,27 @@ int _PyUnicode_IsAlpha(Py_UNICODE ch)
 /* Export the interfaces using the wchar_t type for portability
   reasons:  */

-int _PyUnicode_IsLowercase(Py_UNICODE ch)
+int _PyUnicode_IsLowercase(Py_UCS4 ch)
 {
    return iswlower(ch);
 }

-int _PyUnicode_IsUppercase(Py_UNICODE ch)
+int _PyUnicode_IsUppercase(Py_UCS4 ch)
 {
    return iswupper(ch);
 }

-Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
+Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
 {
    return towlower(ch);
 }

-Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
+Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
 {
    return towupper(ch);
 }

-int _PyUnicode_IsAlpha(Py_UNICODE ch)
+int _PyUnicode_IsAlpha(Py_UCS4 ch)
 {
    return iswalpha(ch);
 }

--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -28,7 +28,7 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "2.6"
+VERSION = "3.2"

 # The Unicode Database
 UNIDATA_VERSION = "5.2.0"
@@ -479,7 +479,7 @@ def makeunicodetype(unicode, trace):
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
-    print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
+    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
@@ -488,21 +488,10 @@ def makeunicodetype(unicode, trace):
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

-        haswide = False
-        hasnonewide = False
        codepoints.sort()
        for codepoint in codepoints:
-            if codepoint < 0x10000:
-                hasnonewide = True
-            if codepoint >= 0x10000 and not haswide:
-                print('#ifdef Py_UNICODE_WIDE', file=fp)
-                haswide = True
            print('    case 0x%04X:' % (codepoint,), file=fp)
-        if haswide and hasnonewide:
-            print('#endif', file=fp)
        print('        return (double) %s;' % (value,), file=fp)
-        if haswide and not hasnonewide:
-            print('#endif', file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
@@ -512,27 +501,16 @@ def makeunicodetype(unicode, trace):
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
-    print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
+    print('int _PyUnicode_IsWhitespace(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
    print('    return iswspace(ch);', file=fp)
    print('#else', file=fp)
    print('    switch (ch) {', file=fp)

-    haswide = False
-    hasnonewide = False
    for codepoint in sorted(spaces):
-        if codepoint < 0x10000:
-            hasnonewide = True
-        if codepoint >= 0x10000 and not haswide:
-            print('#ifdef Py_UNICODE_WIDE', file=fp)
-            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
-    if haswide and hasnonewide:
-        print('#endif', file=fp)
    print('        return 1;', file=fp)
-    if haswide and not hasnonewide:
-        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
@@ -545,23 +523,12 @@ def makeunicodetype(unicode, trace):
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
-    print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
+    print('int _PyUnicode_IsLinebreak(register const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
-    haswide = False
-    hasnonewide = False
    for codepoint in sorted(linebreaks):
-        if codepoint < 0x10000:
-            hasnonewide = True
-        if codepoint >= 0x10000 and not haswide:
-            print('#ifdef Py_UNICODE_WIDE', file=fp)
-            haswide = True
        print('    case 0x%04X:' % (codepoint,), file=fp)
-    if haswide and hasnonewide:
-        print('#endif', file=fp)
    print('        return 1;', file=fp)
-    if haswide and not hasnonewide:
-        print('#endif', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)