Commit 3486e84c authored by Chris Toshok

Merge pull request #427 from toshok/unicode-hash

use the same hash function for both unicode and string objects
parents ed5f850c 15a0ed30
...@@ -6598,14 +6598,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) ...@@ -6598,14 +6598,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
} }
// Pyston change: forward declaration of the unicode hash helper used by
// unicode_hash() below; it returns the hash as a plain size_t.
// NOTE(review): the definition is expected to have C linkage (extern "C") so
// this C-side declaration links against it -- confirm against the runtime.
extern size_t unicodeHashUnboxed(PyUnicodeObject* obj);
static long static long
unicode_hash(PyUnicodeObject *self) unicode_hash(PyUnicodeObject *self)
{ {
// Pyston change: just convert to a str and hash, since we use std::hash and not return unicodeHashUnboxed(self);
// CPython's hashing algorithm they duplicated here:
PyObject* str = PyUnicode_AsEncodedString((PyObject*)self, "utf8", "replace");
return str->ob_type->tp_hash(str);
#if 0 #if 0
/* Since Unicode objects compare equal to their ASCII string /* Since Unicode objects compare equal to their ASCII string
counterparts, they should use the individual character values counterparts, they should use the individual character values
......
...@@ -128,7 +128,7 @@ static Box* (*callattrInternal3)(Box*, const std::string*, LookupScope, CallRewr ...@@ -128,7 +128,7 @@ static Box* (*callattrInternal3)(Box*, const std::string*, LookupScope, CallRewr
size_t PyHasher::operator()(Box* b) const { size_t PyHasher::operator()(Box* b) const {
if (b->cls == str_cls) { if (b->cls == str_cls) {
std::hash<std::string> H; StringHash<std::string> H;
return H(static_cast<BoxedString*>(b)->s); return H(static_cast<BoxedString*>(b)->s);
} }
......
...@@ -1478,10 +1478,20 @@ failed: ...@@ -1478,10 +1478,20 @@ failed:
return NULL; return NULL;
} }
// Computes the hash of a unicode object and returns it as a raw (unboxed)
// size_t.  The value is produced by pyston::StringHash over the object's
// Py_UNICODE buffer and its explicit length, so embedded NUL code units are
// hashed as well.
//
// self->hash acts as a cache: -1 means "nothing cached yet", any other value
// is returned directly.
extern "C" size_t unicodeHashUnboxed(PyUnicodeObject* self) {
    if (self->hash != -1)
        return self->hash;

    Py_ssize_t length = PyUnicode_GET_SIZE(self);
    Py_UNICODE* units = PyUnicode_AS_UNICODE(self);

    pyston::StringHash<Py_UNICODE> hasher;
    const size_t result = hasher(units, length);

    // Store the result so the cached fast path above is actually taken on
    // later calls; previously the field was read but never written here.
    // A result that converts to -1 simply stays "uncached" and is recomputed.
    // Assumes the hash field is as wide as size_t so the cached value
    // round-trips unchanged -- TODO confirm for non-LP64 targets.
    // NOTE(review): a result equal to (size_t)-1 also collides with the -1
    // error sentinel of CPython's tp_hash protocol when returned through
    // unicode_hash; confirm whether it should be remapped (CPython uses -2),
    // keeping str and unicode hashes in agreement.
    self->hash = result;
    return result;
}
extern "C" Box* strHash(BoxedString* self) { extern "C" Box* strHash(BoxedString* self) {
assert(isSubclass(self->cls, str_cls)); assert(isSubclass(self->cls, str_cls));
std::hash<std::string> H; StringHash<std::string> H;
return boxInt(H(self->s)); return boxInt(H(self->s));
} }
......
...@@ -385,6 +385,37 @@ class BoxedUnicode : public Box { ...@@ -385,6 +385,37 @@ class BoxedUnicode : public Box {
// TODO implementation // TODO implementation
}; };
// djb2-style hash functor (seed 5381, hash = hash * 33 + c) over a sequence
// of code units of type T.  Using one algorithm for every unit type means
// sequences holding the same unit values hash identically regardless of T.
//
// Both call operators are const so the functor can be invoked through a const
// object, as hasher objects commonly are.
//
// NOTE: each unit is added after the usual integer conversions, so for a
// signed T (e.g. plain char on many platforms) negative units sign-extend.
template <typename T> struct StringHash {
    // Hashes a zero-terminated sequence; the terminating zero unit is not
    // mixed into the hash.
    size_t operator()(const T* str) const {
        size_t hash = 5381;
        for (; *str; ++str)
            hash = ((hash << 5) + hash) + *str; /* hash * 33 + c */
        return hash;
    }

    // Hashes exactly `len` units starting at `str` (zero units included).
    // `len` is a wide signed type so that Py_ssize_t / size_t lengths are not
    // truncated to int; a zero or negative `len` hashes no units and yields
    // the seed value.
    size_t operator()(const T* str, long long len) const {
        size_t hash = 5381;
        for (long long i = 0; i < len; ++i)
            hash = ((hash << 5) + hash) + str[i]; /* hash * 33 + c */
        return hash;
    }
};
template <> struct StringHash<std::string> {
size_t operator()(const std::string& str) {
StringHash<char> H;
return H(&str[0], str.size());
}
};
class BoxedInstanceMethod : public Box { class BoxedInstanceMethod : public Box {
public: public:
Box** in_weakreflist; Box** in_weakreflist;
......
...@@ -24,6 +24,7 @@ print u'a' in c.__dict__ ...@@ -24,6 +24,7 @@ print u'a' in c.__dict__
print u'' == '' print u'' == ''
print '' == u'' print '' == u''
print hash(u'') == hash('') print hash(u'') == hash('')
print hash(u'hello world') == hash('hello world')
print "Hello " + u" World" print "Hello " + u" World"
print u"Hello " + " World" print u"Hello " + " World"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment