SF #989185: Drop unicode.iswide() and unicode.width() and add
unicodedata.east_asian_width().

You can still implement your own simple width() function using it like this:

    def width(u):
        w = 0
        for c in unicodedata.normalize('NFC', u):
            cwidth = unicodedata.east_asian_width(c)
            if cwidth in ('W', 'F'):
                w += 2
            else:
                w += 1
        return w

SF #989185: Drop unicode.iswide() and unicode.width() and add
unicodedata.east_asian_width().

You can still implement your own simple width() function using it like this:

    def width(u):
        w = 0
        for c in unicodedata.normalize('NFC', u):
            cwidth = unicodedata.east_asian_width(c)
            if cwidth in ('W', 'F'):
                w += 2
            else:
                w += 1
        return w
e9ddfbb4 · Hye-Shik Chang · b5047fd0 · e9ddfbb4 · e9ddfbb4 · e9ddfbb4
Commit e9ddfbb4 authored Aug 04, 2004 by Hye-Shik Chang
15 changed files
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -894,11 +894,6 @@ functions depending on the Python configuration.
  character.
 \end{cfuncdesc}

-\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
-  Returns 1/0 depending on whether \var{ch} is a wide or full-width
-  character.
-\end{cfuncdesc}
-
 These APIs can be used for fast direct character conversions:

 \begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
@@ -957,10 +952,6 @@ use these APIs:
  Return the length of the Unicode object.
 \end{cfuncdesc}

-\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
-  Return the fixed-width representation length of the Unicode object.
-\end{cfuncdesc}
-
 \begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
                                                      const char *encoding,
                                                      const char *errors}

--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@@ -664,12 +664,6 @@ there is at least one cased character, false otherwise.
 For 8-bit strings, this method is locale-dependent.
 \end{methoddesc}

-\begin{methoddesc}[string]{iswide}{}
-Return true if all characters in the string are wide or full width and
-there is at least one wide or full width character, false otherwise.
-This method is supported by unicode type only.
-\end{methoddesc}
-
 \begin{methoddesc}[string]{join}{seq}
 Return a string which is the concatenation of the strings in the
 sequence \var{seq}.  The separator between elements is the string
@@ -810,11 +804,6 @@ Return a copy of the string converted to uppercase.
 For 8-bit strings, this method is locale-dependent.
 \end{methoddesc}

-\begin{methoddesc}[string]{width}{}
-Return length of fixed-width representation of the string. This method
-is supported by unicode type only.
-\end{methoddesc}
-
 \begin{methoddesc}[string]{zfill}{width}
 Return the numeric string left filled with zeros in a string
 of length \var{width}. The original string is returned if

--- a/Doc/lib/libunicodedata.tex
+++ b/Doc/lib/libunicodedata.tex
@@ -71,6 +71,11 @@ defines the following functions:
  class is defined.
 \end{funcdesc}

+\begin{funcdesc}{east_asian_width}{unichr}
+  Returns the east asian width of assigned to the Unicode character
+  \var{unichr} as string.
+\end{funcdesc}
+
 \begin{funcdesc}{mirrored}{unichr}
  Returns the mirrored property of assigned to the Unicode character
  \var{unichr} as integer. Returns \code{1} if the character has been
@@ -123,4 +128,4 @@ In addition, the module exposes the following constant:
 The version of the Unicode database used in this module.

 \versionadded{2.3}
-\end{datadesc}
\ No newline at end of file
+\end{datadesc}
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -181,7 +181,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
-# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
 # define PyUnicode_Join PyUnicodeUCS2_Join
 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 # define PyUnicode_Resize PyUnicodeUCS2_Resize
@@ -201,7 +200,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
-# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
@@ -256,7 +254,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
-# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
 # define PyUnicode_Join PyUnicodeUCS4_Join
 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 # define PyUnicode_Resize PyUnicodeUCS4_Resize
@@ -275,7 +272,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
-# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
@@ -321,8 +317,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;

 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

-#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
-
 #else

 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
@@ -346,8 +340,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;

 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

-#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
-
 #endif

 #define Py_UNICODE_ISALNUM(ch) \
@@ -440,12 +432,6 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
    PyObject *unicode	 	/* Unicode object */
    );

-/* Get the fixed-width representation length of the Unicode object */
-
-PyAPI_FUNC(int) PyUnicode_GetWidth(
-    PyObject *unicode	 	/* Unicode object */
-    );
-
 /* Get the maximum ordinal for a Unicode character. */
 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);

@@ -1176,10 +1162,6 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
    Py_UNICODE ch 	/* Unicode character */
    );

-PyAPI_FUNC(int) _PyUnicode_IsWide(
-    Py_UNICODE ch 	/* Unicode character */
-    );
-
 #ifdef __cplusplus
 }
 #endif

--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -695,28 +695,3 @@ class MixinStrUserStringTest:

        self.checkraises(TypeError, 'xyz', 'decode', 42)
        self.checkraises(TypeError, 'xyz', 'encode', 42)
-
-
-class MixinUnicodeUserStringTest:
-    # Additional tests that only work with
-    # unicode compatible object, i.e. unicode and UserString
-
-    def test_iswide(self):
-        self.checkequal(False, u'', 'iswide')
-        self.checkequal(False, u'\x1f', 'iswide') # Neutral
-        self.checkequal(False, u'\x20', 'iswide') # Narrow
-        self.checkequal(True, u'\u2329', 'iswide') # Wide
-        self.checkequal(False, u'\uff64', 'iswide') # Half
-        self.checkequal(True, u'\u3000', 'iswide') # Full
-        self.checkequal(False, u'\u2460', 'iswide') # Ambiguous
-        self.checkequal(True, u'\ud55c\uae00', 'iswide')
-        self.checkequal(False, u'\ud55c\u2606\uae00', 'iswide')
-
-    def test_width(self):
-        self.checkequal(0, u'', 'width')
-        self.checkequal(4, u'abcd', 'width')
-        self.checkequal(2, u'\u0187\u01c9', 'width')
-        self.checkequal(3, u'\u2460\u2329', 'width')
-        self.checkequal(3, u'\u2329\u2460', 'width')
-        self.checkequal(4, u'\ud55c\uae00', 'width')
-        self.checkequal(5, u'\ud55c\u2606\uae00', 'width')
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -11,8 +11,7 @@ from test import test_support, string_tests

 class UnicodeTest(
    string_tests.CommonTest,
-    string_tests.MixinStrUnicodeUserStringTest,
-    string_tests.MixinUnicodeUserStringTest
+    string_tests.MixinStrUnicodeUserStringTest
    ):
    type2test = unicode


--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -174,6 +174,17 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
        # The rest can be found in test_normalization.py
        # which requires an external file.

+    def test_east_asian_width(self):
+        eaw = self.db.east_asian_width
+        self.assertRaises(TypeError, eaw, 'a')
+        self.assertRaises(TypeError, eaw, u'')
+        self.assertRaises(TypeError, eaw, u'ra')
+        self.assertEqual(eaw(u'\x1e'), 'N')
+        self.assertEqual(eaw(u'\x20'), 'Na')
+        self.assertEqual(eaw(u'\uC894'), 'W')
+        self.assertEqual(eaw(u'\uFF66'), 'H')
+        self.assertEqual(eaw(u'\uFF1F'), 'F')
+        self.assertEqual(eaw(u'\u2010'), 'A')

 class UnicodeMiscTest(UnicodeDatabaseTest):


--- a/Lib/test/test_userstring.py
+++ b/Lib/test/test_userstring.py
@@ -11,8 +11,7 @@ class UserStringTest(
    string_tests.CommonTest,
    string_tests.MixinStrUnicodeUserStringTest,
    string_tests.MixinStrStringUserStringTest,
-    string_tests.MixinStrUserStringTest,
-    string_tests.MixinUnicodeUserStringTest
+    string_tests.MixinStrUserStringTest
    ):

    type2test = UserString

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,9 @@ Core and builtins
 - Added a workaround for proper string operations in BSDs.  str.split
  and str.is* methods can now work correctly with UTF-8 locales.

+- unicode.iswide() and unicode.width() is dropped and the East Asian
+  Width support is moved to unicodedata extension module.
+
 Extension modules
 -----------------


--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -24,6 +24,8 @@ typedef struct {
    const unsigned char	bidirectional; 	/* index into
 					   _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;	/* true if mirrored in bidir mode */
+    const unsigned char east_asian_width;	/* index into
+						   _PyUnicode_EastAsianWidth */
 } _PyUnicode_DatabaseRecord;

 /* data file generated by Tools/unicode/makeunicodedata.py */
@@ -204,6 +206,24 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
    return PyInt_FromLong((int) _getrecord(v)->mirrored);
 }

+static PyObject *
+unicodedata_east_asian_width(PyObject *self, PyObject *args)
+{
+    PyUnicodeObject *v;
+    int index;
+
+    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
+			  &PyUnicode_Type, &v))
+	return NULL;
+    if (PyUnicode_GET_SIZE(v) != 1) {
+	PyErr_SetString(PyExc_TypeError,
+			"need a single Unicode character as parameter");
+	return NULL;
+    }
+    index = (int) _getrecord(v)->east_asian_width;
+    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
+}
+
 static PyObject *
 unicodedata_decomposition(PyObject *self, PyObject *args)
 {
@@ -871,6 +891,7 @@ static PyMethodDef unicodedata_functions[] = {
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
+    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},

--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -19,7 +19,6 @@
 #define SPACE_MASK 0x20
 #define TITLE_MASK 0x40
 #define UPPER_MASK 0x80
-#define WIDE_MASK 0x100

 typedef struct {
    const Py_UNICODE upper;
@@ -323,15 +322,6 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
    return 1;
 }

-/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
-
-int _PyUnicode_IsWide(Py_UNICODE ch)
-{
-    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-
-    return (ctype->flags & WIDE_MASK) != 0;
-}
-
 #ifndef WANT_WCTYPE_FUNCTIONS

 /* Returns 1 for Unicode characters having the bidirectional type

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -702,27 +702,6 @@ int PyUnicode_GetSize(PyObject *unicode)
    return -1;
 }

-int PyUnicode_GetWidth(PyObject *unicode)
-{
-    const Py_UNICODE *p, *e;
-    int width;
-
-    if (!PyUnicode_Check(unicode)) {
-	PyErr_BadArgument();
-	return -1;
-    }
-
-    p = PyUnicode_AS_UNICODE(unicode);
-    e = p + PyUnicode_GET_SIZE(unicode);
-    for (width = 0; p < e; p++)
-	if (Py_UNICODE_ISWIDE(*p))
-	    width += 2;
-	else
-	    width++;
-
-    return width;
-}
-
 const char *PyUnicode_GetDefaultEncoding(void)
 {
    return unicode_default_encoding;
@@ -5436,35 +5415,6 @@ unicode_isnumeric(PyUnicodeObject *self)
    return PyBool_FromLong(1);
 }

-PyDoc_STRVAR(iswide__doc__,
-"S.iswide() -> bool\n\
-\n\
-Return True if all characters in S are wide width\n\
-and there is at least one character in S, False otherwise.");
-
-static PyObject*
-unicode_iswide(PyUnicodeObject *self)
-{
-    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
-    register const Py_UNICODE *e;
-
-    /* Shortcut for single character strings */
-    if (PyUnicode_GET_SIZE(self) == 1 &&
-	Py_UNICODE_ISWIDE(*p))
-	Py_RETURN_TRUE;
-
-    /* Special case for empty strings */
-    if (PyString_GET_SIZE(self) == 0)
-	Py_RETURN_FALSE;
-
-    e = p + PyUnicode_GET_SIZE(self);
-    for (; p < e; p++) {
-	if (!Py_UNICODE_ISWIDE(*p))
-	    Py_RETURN_FALSE;
-    }
-    Py_RETURN_TRUE;
-}
-
 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> unicode\n\
 \n\
@@ -6076,21 +6026,6 @@ unicode_upper(PyUnicodeObject *self)
    return fixup(self, fixupper);
 }

-PyDoc_STRVAR(width__doc__,
-"S.width() -> unicode\n\
-\n\
-Return a fixed-width representation length of S.");
-
-static PyObject*
-unicode_width(PyObject *self)
-{
-    int width = PyUnicode_GetWidth(self);
-    if (width == -1)
-	return NULL;
-    else
-	return PyInt_FromLong((long)width);
-}
-
 PyDoc_STRVAR(zfill__doc__,
 "S.zfill(width) -> unicode\n\
 \n\
@@ -6255,8 +6190,6 @@ static PyMethodDef unicode_methods[] = {
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
-    {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
-    {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
 #if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},

--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -43,6 +43,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

+EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -52,7 +54,6 @@ LINEBREAK_MASK = 0x10
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
-WIDE_MASK = 0x100

 def maketables(trace=0):

@@ -72,7 +73,7 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

-    dummy = (0, 0, 0, 0)
+    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
@@ -91,8 +92,9 @@ def makeunicodedata(unicode, trace):
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
-                category, combining, bidirectional, mirrored
+                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
@@ -204,7 +206,7 @@ def makeunicodedata(unicode, trace):
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
-        print >>fp, "    {%d, %d, %d, %d}," % item
+        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

@@ -239,6 +241,12 @@ def makeunicodedata(unicode, trace):
    print >>fp, "    NULL"
    print >>fp, "};"

+    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
+    for name in EASTASIANWIDTH_NAMES:
+        print >>fp, "    \"%s\"," % name
+    print >>fp, "    NULL"
+    print >>fp, "};"
+
    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
@@ -334,8 +342,6 @@ def makeunicodetype(unicode, trace):
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
-            if record[15] in ('W', 'F'): # Wide or Full width
-                flags |= WIDE_MASK
            item = (
                upper, lower, title, decimal, digit, flags
                )