Commit 15a0ed30 authored by Chris Toshok

use djb's string hash function for both strings and unicode.

This keeps us from having to UTF-8 encode unicode objects (into BoxedStrings)
whenever we want to hash them.
parent ed5f850c
......@@ -6598,14 +6598,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
}
extern size_t unicodeHashUnboxed(PyUnicodeObject* obj);
static long
unicode_hash(PyUnicodeObject *self)
{
// Pyston change: just convert to a str and hash, since we use std::hash and not
// CPython's hashing algorithm they duplicated here:
PyObject* str = PyUnicode_AsEncodedString((PyObject*)self, "utf8", "replace");
return str->ob_type->tp_hash(str);
return unicodeHashUnboxed(self);
#if 0
/* Since Unicode objects compare equal to their ASCII string
counterparts, they should use the individual character values
......
......@@ -128,7 +128,7 @@ static Box* (*callattrInternal3)(Box*, const std::string*, LookupScope, CallRewr
size_t PyHasher::operator()(Box* b) const {
if (b->cls == str_cls) {
std::hash<std::string> H;
StringHash<std::string> H;
return H(static_cast<BoxedString*>(b)->s);
}
......
......@@ -1478,10 +1478,20 @@ failed:
return NULL;
}
// Returns the hash of a unicode object as a raw size_t.
//
// If self->hash holds anything other than -1 that value is returned directly;
// otherwise the object's Py_UNICODE buffer is hashed with
// pyston::StringHash<Py_UNICODE> over PyUnicode_GET_SIZE(self) code units.
//
// NOTE(review): this function never writes self->hash, so the early return
// only triggers if some other code populates that field -- confirm.
extern "C" size_t unicodeHashUnboxed(PyUnicodeObject* self) {
    if (self->hash == -1) {
        pyston::StringHash<Py_UNICODE> hasher;
        return hasher(PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self));
    }
    return self->hash;
}
// Hashes the string held in self->s and returns the result boxed via boxInt().
// Asserts that self's class is a subclass of str_cls.
extern "C" Box* strHash(BoxedString* self) {
assert(isSubclass(self->cls, str_cls));
// NOTE(review): this diff view shows two declarations of H back to back;
// presumably the std::hash line is the one removed by the commit and the
// StringHash line is its replacement -- confirm against the raw patch.
std::hash<std::string> H;
StringHash<std::string> H;
return boxInt(H(self->s));
}
......
......@@ -385,6 +385,37 @@ class BoxedUnicode : public Box {
// TODO implementation
};
// djb2 ("hash * 33 + c") string hash over code units of type T.
//
// Both overloads start from 5381 and fold every unit in as hash * 33 + unit,
// with arithmetic wrapping modulo the width of size_t. Sequences holding the
// same non-negative unit values therefore hash identically whatever T is.
// Each unit is converted to size_t by the usual arithmetic conversions, so a
// negative unit of a signed T wraps around rather than being zero-extended.
//
// The call operators are const so the functor can be invoked through a const
// object and used wherever a const-callable hash is required.
template <typename T> struct StringHash {
    // Hashes a zero-terminated sequence; the terminator itself is not hashed.
    size_t operator()(const T* str) const {
        size_t hash = 5381;
        for (; *str; ++str) {
            const T unit = *str;
            hash = hash * 33 + unit;
        }
        return hash;
    }

    // Hashes exactly `len` units starting at `str`; embedded zeros are hashed
    // like any other unit. A zero or negative `len` yields the seed, 5381.
    // The length is 64-bit so that size_t / ssize_t lengths above INT_MAX are
    // not truncated on their way in.
    size_t operator()(const T* str, long long len) const {
        size_t hash = 5381;
        for (long long i = 0; i < len; ++i) {
            const T unit = str[i];
            hash = hash * 33 + unit;
        }
        return hash;
    }
};
template <> struct StringHash<std::string> {
size_t operator()(const std::string& str) {
StringHash<char> H;
return H(&str[0], str.size());
}
};
class BoxedInstanceMethod : public Box {
public:
Box** in_weakreflist;
......
......@@ -24,6 +24,7 @@ print u'a' in c.__dict__
print u'' == ''
print '' == u''
print hash(u'') == hash('')
print hash(u'hello world') == hash('hello world')
print "Hello " + u" World"
print u"Hello " + " World"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment