Commit 93e7afc5 authored by Ezio Melotti's avatar Ezio Melotti

#9200: The str.is* methods now work with strings that contain non-BMP...

#9200: The str.is* methods now work with strings that contain non-BMP characters even in narrow Unicode builds.
parent 3d3bc812
This diff is collapsed.
...@@ -10,6 +10,9 @@ What's New in Python 3.2.3? ...@@ -10,6 +10,9 @@ What's New in Python 3.2.3?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9200: The str.is* methods now work with strings that contain non-BMP
characters even in narrow Unicode builds.
- Issue #12791: Break reference cycles early when a generator exits with - Issue #12791: Break reference cycles early when a generator exits with
an exception. an exception.
......
...@@ -6442,6 +6442,37 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, ...@@ -6442,6 +6442,37 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
start = 0; \ start = 0; \
} }
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
* by 'ptr', possibly combining surrogate pairs on narrow builds.
* 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
* that should be returned and 'end' pointing to the end of the buffer.
* ('end' is used on narrow builds to detect a lone surrogate at the
* end of the buffer that should be returned unchanged.)
* The ptr and end arguments should be side-effect free and ptr must an lvalue.
* The type of the returned char is always Py_UCS4.
*
* Note: the macro advances ptr to next char, so it might have side-effects
* (especially if used with other macros).
*/
/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
#else
#define _Py_UNICODE_NEXT(ptr, end) \
(((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \
_Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \
((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
(Py_UCS4)*(ptr)++)
#endif
Py_ssize_t PyUnicode_Count(PyObject *str, Py_ssize_t PyUnicode_Count(PyObject *str,
PyObject *substr, PyObject *substr,
Py_ssize_t start, Py_ssize_t start,
...@@ -7705,8 +7736,8 @@ unicode_islower(PyUnicodeObject *self) ...@@ -7705,8 +7736,8 @@ unicode_islower(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
cased = 0; cased = 0;
for (; p < e; p++) { while (p < e) {
register const Py_UNICODE ch = *p; const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
return PyBool_FromLong(0); return PyBool_FromLong(0);
...@@ -7739,8 +7770,8 @@ unicode_isupper(PyUnicodeObject *self) ...@@ -7739,8 +7770,8 @@ unicode_isupper(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
cased = 0; cased = 0;
for (; p < e; p++) { while (p < e) {
register const Py_UNICODE ch = *p; const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
return PyBool_FromLong(0); return PyBool_FromLong(0);
...@@ -7777,8 +7808,8 @@ unicode_istitle(PyUnicodeObject *self) ...@@ -7777,8 +7808,8 @@ unicode_istitle(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
cased = 0; cased = 0;
previous_is_cased = 0; previous_is_cased = 0;
for (; p < e; p++) { while (p < e) {
register const Py_UNICODE ch = *p; const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
if (previous_is_cased) if (previous_is_cased)
...@@ -7820,8 +7851,9 @@ unicode_isspace(PyUnicodeObject *self) ...@@ -7820,8 +7851,9 @@ unicode_isspace(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISSPACE(*p)) const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (!Py_UNICODE_ISSPACE(ch))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
...@@ -7849,8 +7881,8 @@ unicode_isalpha(PyUnicodeObject *self) ...@@ -7849,8 +7881,8 @@ unicode_isalpha(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISALPHA(*p)) if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
...@@ -7878,8 +7910,9 @@ unicode_isalnum(PyUnicodeObject *self) ...@@ -7878,8 +7910,9 @@ unicode_isalnum(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISALNUM(*p)) const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (!Py_UNICODE_ISALNUM(ch))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
...@@ -7907,8 +7940,8 @@ unicode_isdecimal(PyUnicodeObject *self) ...@@ -7907,8 +7940,8 @@ unicode_isdecimal(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISDECIMAL(*p)) if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
...@@ -7936,8 +7969,8 @@ unicode_isdigit(PyUnicodeObject *self) ...@@ -7936,8 +7969,8 @@ unicode_isdigit(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISDIGIT(*p)) if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
...@@ -7965,37 +7998,22 @@ unicode_isnumeric(PyUnicodeObject *self) ...@@ -7965,37 +7998,22 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(0); return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISNUMERIC(*p)) if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0); return PyBool_FromLong(0);
} }
return PyBool_FromLong(1); return PyBool_FromLong(1);
} }
static Py_UCS4
decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
{
Py_UCS4 ch;
assert(*i < size);
ch = s[(*i)++];
#ifndef Py_UNICODE_WIDE
if ((ch & 0xfffffc00) == 0xd800 &&
*i < size
&& (s[*i] & 0xFFFFFC00) == 0xDC00)
ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
#endif
return ch;
}
int int
PyUnicode_IsIdentifier(PyObject *self) PyUnicode_IsIdentifier(PyObject *self)
{ {
Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
Py_UCS4 first;
const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
const Py_UNICODE *e;
Py_UCS4 first;
/* Special case for empty strings */ /* Special case for empty strings */
if (!size) if (PyUnicode_GET_SIZE(self) == 0)
return 0; return 0;
/* PEP 3131 says that the first character must be in /* PEP 3131 says that the first character must be in
...@@ -8006,12 +8024,13 @@ PyUnicode_IsIdentifier(PyObject *self) ...@@ -8006,12 +8024,13 @@ PyUnicode_IsIdentifier(PyObject *self)
definition of XID_Start and XID_Continue, it is sufficient definition of XID_Start and XID_Continue, it is sufficient
to check just for these, except that _ must be allowed to check just for these, except that _ must be allowed
as starting an identifier. */ as starting an identifier. */
first = decode_ucs4(p, &i, size); e = p + PyUnicode_GET_SIZE(self);
first = _Py_UNICODE_NEXT(p, e);
if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
return 0; return 0;
while (i < size) while (p < e)
if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
return 0; return 0;
return 1; return 1;
} }
...@@ -8046,8 +8065,8 @@ unicode_isprintable(PyObject *self) ...@@ -8046,8 +8065,8 @@ unicode_isprintable(PyObject *self)
} }
e = p + PyUnicode_GET_SIZE(self); e = p + PyUnicode_GET_SIZE(self);
for (; p < e; p++) { while (p < e) {
if (!Py_UNICODE_ISPRINTABLE(*p)) { if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Py_RETURN_FALSE; Py_RETURN_FALSE;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment