Commit e9ddfbb4 authored by Hye-Shik Chang

SF #989185: Drop unicode.iswide() and unicode.width() and add

unicodedata.east_asian_width().  You can still implement your own
simple width() function using it like this:
    def width(u):
        """Return the number of fixed-width columns needed to display *u*.

        The string is NFC-normalized first; each character whose East
        Asian Width is Wide ('W') or Fullwidth ('F') takes two columns,
        and every other character takes one.
        """
        columns = 0
        for ch in unicodedata.normalize('NFC', u):
            # True counts as 1, so a double-width character adds 1 + 1.
            columns += 1 + (unicodedata.east_asian_width(ch) in ('W', 'F'))
        return columns
parent b5047fd0
......@@ -894,11 +894,6 @@ functions depending on the Python configuration.
character.
\end{cfuncdesc}
\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
Returns 1/0 depending on whether \var{ch} is a wide or full-width
character.
\end{cfuncdesc}
These APIs can be used for fast direct character conversions:
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
......@@ -957,10 +952,6 @@ use these APIs:
Return the length of the Unicode object.
\end{cfuncdesc}
\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
Return the fixed-width representation length of the Unicode object.
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
const char *encoding,
const char *errors}
......
......@@ -664,12 +664,6 @@ there is at least one cased character, false otherwise.
For 8-bit strings, this method is locale-dependent.
\end{methoddesc}
\begin{methoddesc}[string]{iswide}{}
Return true if all characters in the string are wide or full width and
there is at least one wide or full width character, false otherwise.
This method is supported by unicode type only.
\end{methoddesc}
\begin{methoddesc}[string]{join}{seq}
Return a string which is the concatenation of the strings in the
sequence \var{seq}. The separator between elements is the string
......@@ -810,11 +804,6 @@ Return a copy of the string converted to uppercase.
For 8-bit strings, this method is locale-dependent.
\end{methoddesc}
\begin{methoddesc}[string]{width}{}
Return length of fixed-width representation of the string. This method
is supported by unicode type only.
\end{methoddesc}
\begin{methoddesc}[string]{zfill}{width}
Return the numeric string left filled with zeros in a string
of length \var{width}. The original string is returned if
......
......@@ -71,6 +71,11 @@ defines the following functions:
class is defined.
\end{funcdesc}
\begin{funcdesc}{east_asian_width}{unichr}
Returns the East Asian width assigned to the Unicode character
\var{unichr} as a string.
\end{funcdesc}
\begin{funcdesc}{mirrored}{unichr}
Returns the mirrored property assigned to the Unicode character
\var{unichr} as integer. Returns \code{1} if the character has been
......@@ -123,4 +128,4 @@ In addition, the module exposes the following constant:
The version of the Unicode database used in this module.
\versionadded{2.3}
\end{datadesc}
\ No newline at end of file
\end{datadesc}
......@@ -181,7 +181,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
......@@ -201,7 +200,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
......@@ -256,7 +254,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
......@@ -275,7 +272,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
......@@ -321,8 +317,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
#else
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
......@@ -346,8 +340,6 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
#endif
#define Py_UNICODE_ISALNUM(ch) \
......@@ -440,12 +432,6 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
PyObject *unicode /* Unicode object */
);
/* Get the fixed-width representation length of the Unicode object */
PyAPI_FUNC(int) PyUnicode_GetWidth(
PyObject *unicode /* Unicode object */
);
/* Get the maximum ordinal for a Unicode character. */
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
......@@ -1176,10 +1162,6 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Py_UNICODE ch /* Unicode character */
);
PyAPI_FUNC(int) _PyUnicode_IsWide(
Py_UNICODE ch /* Unicode character */
);
#ifdef __cplusplus
}
#endif
......
......@@ -695,28 +695,3 @@ class MixinStrUserStringTest:
self.checkraises(TypeError, 'xyz', 'decode', 42)
self.checkraises(TypeError, 'xyz', 'encode', 42)
class MixinUnicodeUserStringTest:
# Additional tests that only work with
# unicode compatible object, i.e. unicode and UserString
def test_iswide(self):
self.checkequal(False, u'', 'iswide')
self.checkequal(False, u'\x1f', 'iswide') # Neutral
self.checkequal(False, u'\x20', 'iswide') # Narrow
self.checkequal(True, u'\u2329', 'iswide') # Wide
self.checkequal(False, u'\uff64', 'iswide') # Half
self.checkequal(True, u'\u3000', 'iswide') # Full
self.checkequal(False, u'\u2460', 'iswide') # Ambiguous
self.checkequal(True, u'\ud55c\uae00', 'iswide')
self.checkequal(False, u'\ud55c\u2606\uae00', 'iswide')
def test_width(self):
self.checkequal(0, u'', 'width')
self.checkequal(4, u'abcd', 'width')
self.checkequal(2, u'\u0187\u01c9', 'width')
self.checkequal(3, u'\u2460\u2329', 'width')
self.checkequal(3, u'\u2329\u2460', 'width')
self.checkequal(4, u'\ud55c\uae00', 'width')
self.checkequal(5, u'\ud55c\u2606\uae00', 'width')
......@@ -11,8 +11,7 @@ from test import test_support, string_tests
class UnicodeTest(
string_tests.CommonTest,
string_tests.MixinStrUnicodeUserStringTest,
string_tests.MixinUnicodeUserStringTest
string_tests.MixinStrUnicodeUserStringTest
):
type2test = unicode
......
......@@ -174,6 +174,17 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
# The rest can be found in test_normalization.py
# which requires an external file.
def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, 'a')
self.assertRaises(TypeError, eaw, u'')
self.assertRaises(TypeError, eaw, u'ra')
self.assertEqual(eaw(u'\x1e'), 'N')
self.assertEqual(eaw(u'\x20'), 'Na')
self.assertEqual(eaw(u'\uC894'), 'W')
self.assertEqual(eaw(u'\uFF66'), 'H')
self.assertEqual(eaw(u'\uFF1F'), 'F')
self.assertEqual(eaw(u'\u2010'), 'A')
class UnicodeMiscTest(UnicodeDatabaseTest):
......
......@@ -11,8 +11,7 @@ class UserStringTest(
string_tests.CommonTest,
string_tests.MixinStrUnicodeUserStringTest,
string_tests.MixinStrStringUserStringTest,
string_tests.MixinStrUserStringTest,
string_tests.MixinUnicodeUserStringTest
string_tests.MixinStrUserStringTest
):
type2test = UserString
......
......@@ -67,6 +67,9 @@ Core and builtins
- Added a workaround for proper string operations in BSDs. str.split
and str.is* methods can now work correctly with UTF-8 locales.
- unicode.iswide() and unicode.width() are dropped and the East Asian
Width support is moved to the unicodedata extension module.
Extension modules
-----------------
......
......@@ -24,6 +24,8 @@ typedef struct {
const unsigned char bidirectional; /* index into
_PyUnicode_BidirectionalNames */
const unsigned char mirrored; /* true if mirrored in bidir mode */
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
/* data file generated by Tools/unicode/makeunicodedata.py */
......@@ -204,6 +206,24 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
return PyInt_FromLong((int) _getrecord(v)->mirrored);
}
/* unicodedata.east_asian_width(unichr)

   Return the East Asian Width name of a single Unicode character as a
   string.  Returns NULL with an exception set if the argument is not a
   unicode object, or with TypeError if it is not exactly one character
   long. */
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
PyUnicodeObject *v;
int index;
/* "O!" only accepts an object of the given type (here: unicode). */
if (!PyArg_ParseTuple(args, "O!:east_asian_width",
&PyUnicode_Type, &v))
return NULL;
/* The lookup is per character, so reject empty and multi-character input. */
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"need a single Unicode character as parameter");
return NULL;
}
/* The database record stores an index into the width-name table. */
index = (int) _getrecord(v)->east_asian_width;
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
......@@ -871,6 +891,7 @@ static PyMethodDef unicodedata_functions[] = {
{"bidirectional", unicodedata_bidirectional, METH_VARARGS},
{"combining", unicodedata_combining, METH_VARARGS},
{"mirrored", unicodedata_mirrored, METH_VARARGS},
{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS},
{"decomposition",unicodedata_decomposition, METH_VARARGS},
{"name", unicodedata_name, METH_VARARGS},
{"lookup", unicodedata_lookup, METH_VARARGS},
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -19,7 +19,6 @@
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define WIDE_MASK 0x100
typedef struct {
const Py_UNICODE upper;
......@@ -323,15 +322,6 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
return 1;
}
/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
int _PyUnicode_IsWide(Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
return (ctype->flags & WIDE_MASK) != 0;
}
#ifndef WANT_WCTYPE_FUNCTIONS
/* Returns 1 for Unicode characters having the bidirectional type
......
......@@ -702,27 +702,6 @@ int PyUnicode_GetSize(PyObject *unicode)
return -1;
}
int PyUnicode_GetWidth(PyObject *unicode)
{
const Py_UNICODE *p, *e;
int width;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return -1;
}
p = PyUnicode_AS_UNICODE(unicode);
e = p + PyUnicode_GET_SIZE(unicode);
for (width = 0; p < e; p++)
if (Py_UNICODE_ISWIDE(*p))
width += 2;
else
width++;
return width;
}
const char *PyUnicode_GetDefaultEncoding(void)
{
return unicode_default_encoding;
......@@ -5436,35 +5415,6 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(1);
}
PyDoc_STRVAR(iswide__doc__,
"S.iswide() -> bool\n\
\n\
Return True if all characters in S are wide width\n\
and there is at least one character in S, False otherwise.");
static PyObject*
unicode_iswide(PyUnicodeObject *self)
{
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
register const Py_UNICODE *e;
/* Shortcut for single character strings */
if (PyUnicode_GET_SIZE(self) == 1 &&
Py_UNICODE_ISWIDE(*p))
Py_RETURN_TRUE;
/* Special case for empty strings */
if (PyString_GET_SIZE(self) == 0)
Py_RETURN_FALSE;
e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) {
if (!Py_UNICODE_ISWIDE(*p))
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
}
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\
\n\
......@@ -6076,21 +6026,6 @@ unicode_upper(PyUnicodeObject *self)
return fixup(self, fixupper);
}
PyDoc_STRVAR(width__doc__,
"S.width() -> unicode\n\
\n\
Return a fixed-width representation length of S.");
static PyObject*
unicode_width(PyObject *self)
{
int width = PyUnicode_GetWidth(self);
if (width == -1)
return NULL;
else
return PyInt_FromLong((long)width);
}
PyDoc_STRVAR(zfill__doc__,
"S.zfill(width) -> unicode\n\
\n\
......@@ -6255,8 +6190,6 @@ static PyMethodDef unicode_methods[] = {
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
{"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
{"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
......
This diff is collapsed.
......@@ -43,6 +43,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON" ]
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
......@@ -52,7 +54,6 @@ LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
WIDE_MASK = 0x100
def maketables(trace=0):
......@@ -72,7 +73,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace):
dummy = (0, 0, 0, 0)
dummy = (0, 0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
......@@ -91,8 +92,9 @@ def makeunicodedata(unicode, trace):
combining = int(record[3])
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
item = (
category, combining, bidirectional, mirrored
category, combining, bidirectional, mirrored, eastasianwidth
)
# add entry to index and item tables
i = cache.get(item)
......@@ -204,7 +206,7 @@ def makeunicodedata(unicode, trace):
print >>fp, \
"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
for item in table:
print >>fp, " {%d, %d, %d, %d}," % item
print >>fp, " {%d, %d, %d, %d, %d}," % item
print >>fp, "};"
print >>fp
......@@ -239,6 +241,12 @@ def makeunicodedata(unicode, trace):
print >>fp, " NULL"
print >>fp, "};"
print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
for name in EASTASIANWIDTH_NAMES:
print >>fp, " \"%s\"," % name
print >>fp, " NULL"
print >>fp, "};"
print >>fp, "static const char *decomp_prefix[] = {"
for name in decomp_prefix:
print >>fp, " \"%s\"," % name
......@@ -334,8 +342,6 @@ def makeunicodetype(unicode, trace):
if record[7]:
flags |= DIGIT_MASK
digit = int(record[7])
if record[15] in ('W', 'F'): # Wide or Full width
flags |= WIDE_MASK
item = (
upper, lower, title, decimal, digit, flags
)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment