Commit 09dc34fc authored by Guido van Rossum

Compare and hash unicode objects like their UTF-8 representations.

Accept Unicode characters < 256 for 'c' format.
parent f15a29f9
...@@ -5406,33 +5406,23 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) ...@@ -5406,33 +5406,23 @@ unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
int PyUnicode_Compare(PyObject *left, int PyUnicode_Compare(PyObject *left,
PyObject *right) PyObject *right)
{ {
PyUnicodeObject *u = NULL, *v = NULL; if (PyUnicode_Check(left) && PyUnicode_Check(right))
int result; return unicode_compare((PyUnicodeObject *)left,
(PyUnicodeObject *)right);
/* Coerce the two arguments */ if ((PyString_Check(left) && PyUnicode_Check(right)) ||
u = (PyUnicodeObject *)PyUnicode_FromObject(left); (PyUnicode_Check(left) && PyString_Check(right))) {
if (u == NULL) if (PyUnicode_Check(left))
goto onError; left = _PyUnicode_AsDefaultEncodedString(left, NULL);
v = (PyUnicodeObject *)PyUnicode_FromObject(right); if (PyUnicode_Check(right))
if (v == NULL) right = _PyUnicode_AsDefaultEncodedString(right, NULL);
goto onError; assert(PyString_Check(left));
assert(PyString_Check(right));
/* Shortcut for empty or interned objects */ return PyObject_Compare(left, right);
if (v == u) {
Py_DECREF(u);
Py_DECREF(v);
return 0;
} }
PyErr_Format(PyExc_TypeError,
result = unicode_compare(u, v); "Can't compare %.100s and %.100s",
left->ob_type->tp_name,
Py_DECREF(u); right->ob_type->tp_name);
Py_DECREF(v);
return result;
onError:
Py_XDECREF(u);
Py_XDECREF(v);
return -1; return -1;
} }
...@@ -5802,30 +5792,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) ...@@ -5802,30 +5792,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
} }
static long static long
unicode_hash(PyUnicodeObject *self) unicode_hash(PyObject *self)
{ {
/* Since Unicode objects compare equal to their ASCII string /* Since Unicode objects compare equal to their UTF-8 string
counterparts, they should use the individual character values counterparts, we hash the UTF-8 string. */
as basis for their hash value. This is needed to assure that PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
strings and Unicode objects behave in the same way as return PyObject_Hash(v);
dictionary keys. */
register Py_ssize_t len;
register Py_UNICODE *p;
register long x;
if (self->hash != -1)
return self->hash;
len = PyUnicode_GET_SIZE(self);
p = PyUnicode_AS_UNICODE(self);
x = *p << 7;
while (--len >= 0)
x = (1000003*x) ^ *p++;
x ^= PyUnicode_GET_SIZE(self);
if (x == -1)
x = -2;
self->hash = x;
return x;
} }
PyDoc_STRVAR(index__doc__, PyDoc_STRVAR(index__doc__,
......
...@@ -764,8 +764,12 @@ convertsimple(PyObject *arg, const char **p_format, va_list *p_va, int flags, ...@@ -764,8 +764,12 @@ convertsimple(PyObject *arg, const char **p_format, va_list *p_va, int flags,
char *p = va_arg(*p_va, char *); char *p = va_arg(*p_va, char *);
if (PyString_Check(arg) && PyString_Size(arg) == 1) if (PyString_Check(arg) && PyString_Size(arg) == 1)
*p = PyString_AS_STRING(arg)[0]; *p = PyString_AS_STRING(arg)[0];
else if (PyUnicode_Check(arg) &&
PyUnicode_GET_SIZE(arg) == 1 &&
PyUnicode_AS_UNICODE(arg)[0] < 256)
*p = PyUnicode_AS_UNICODE(arg)[0];
else else
return converterr("char", arg, msgbuf, bufsize); return converterr("char < 256", arg, msgbuf, bufsize);
break; break;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment