Commit b9f52061 authored by Kevin Modzelewski

Merge pull request #508 from undingen/intern

Add support for string interning
parents 45b078d5 217c098f
......@@ -98,7 +98,10 @@ PyAPI_FUNC(void) _Py_ReleaseInternedStrings(void) PYSTON_NOEXCEPT;
PyAPI_FUNC(char) PyString_GetItem(PyObject *, Py_ssize_t) PYSTON_NOEXCEPT;
/* Use only if you know it's a string */
#define PyString_CHECK_INTERNED(op) (((PyStringObject *)(op))->ob_sstate)
// Pyston changes: these aren't direct macros any more [they potentially could be though]
//#define PyString_CHECK_INTERNED(op) (((PyStringObject *)(op))->ob_sstate)
PyAPI_FUNC(int) _PyString_CheckInterned(PyObject *) PYSTON_NOEXCEPT;
#define PyString_CHECK_INTERNED(op) _PyString_CheckInterned((PyObject*)op)
/* Macro, trading safety for speed */
// Pyston changes: these aren't direct macros any more [they potentially could be though]
......
......@@ -334,8 +334,6 @@ w_object(PyObject *v, WFILE *p)
}
#endif
else if (PyString_CheckExact(v)) {
// Pyston change: I think we don't need this
/*
if (p->strings && PyString_CHECK_INTERNED(v)) {
PyObject *o = PyDict_GetItem(p->strings, v);
if (o) {
......@@ -361,8 +359,6 @@ w_object(PyObject *v, WFILE *p)
else {
w_byte(TYPE_STRING, p);
}
*/
w_byte(TYPE_STRING, p);
w_pstring(PyBytes_AS_STRING(v), PyString_GET_SIZE(v), p);
}
#ifdef Py_USING_UNICODE
......
......@@ -52,7 +52,7 @@ extern "C" PyObject* string_splitlines(PyStringObject* self, PyObject* args) noe
namespace pyston {
BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n) {
BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(n != llvm::StringRef::npos, "");
if (s) {
memmove(data(), s, n);
......@@ -62,20 +62,21 @@ BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n) {
}
}
BoxedString::BoxedString(llvm::StringRef lhs, llvm::StringRef rhs) : s(storage(), lhs.size() + rhs.size()) {
BoxedString::BoxedString(llvm::StringRef lhs, llvm::StringRef rhs)
: s(storage(), lhs.size() + rhs.size()), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(lhs.size() + rhs.size() != llvm::StringRef::npos, "");
memmove(data(), lhs.data(), lhs.size());
memmove(data() + lhs.size(), rhs.data(), rhs.size());
data()[lhs.size() + rhs.size()] = 0;
}
BoxedString::BoxedString(llvm::StringRef s) : s(storage(), s.size()) {
BoxedString::BoxedString(llvm::StringRef s) : s(storage(), s.size()), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(s.size() != llvm::StringRef::npos, "");
memmove(data(), s.data(), s.size());
data()[s.size()] = 0;
}
BoxedString::BoxedString(size_t n, char c) : s(storage(), n) {
BoxedString::BoxedString(size_t n, char c) : s(storage(), n), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(n != llvm::StringRef::npos, "");
memset(data(), c, n);
data()[n] = 0;
......@@ -345,23 +346,44 @@ extern "C" Box* strAdd(BoxedString* lhs, Box* _rhs) {
}
static llvm::StringMap<Box*> interned_strings;
extern "C" PyObject* PyString_InternFromString(const char* s) noexcept {
RELEASE_ASSERT(s, "");
auto it = interned_strings.find(s);
if (it == interned_strings.end()) {
Box* b = PyGC_AddRoot(boxString(s));
assert(b);
interned_strings[s] = b;
return b;
} else {
assert(it->second);
return it->second;
auto& entry = interned_strings[s];
if (!entry) {
entry = PyGC_AddRoot(boxString(s));
// CPython returns a mortal interned string here, but in our current implementation interned strings are immortal
((BoxedString*)entry)->interned_state = SSTATE_INTERNED_IMMORTAL;
}
return entry;
}
extern "C" void PyString_InternInPlace(PyObject** o) noexcept {
Py_FatalError("unimplemented");
// Interns the string referenced by *p, in place.
//
// If an equal string is already present in interned_strings, *p is redirected
// to that existing object. Otherwise the string *p points at is itself stored
// in the table (after being passed through PyGC_AddRoot) and marked
// SSTATE_INTERNED_IMMORTAL.
//
// A NULL or non-string *p is a fatal error. Instances of str subclasses and
// strings that are already interned are left untouched.
extern "C" void PyString_InternInPlace(PyObject** p) noexcept {
BoxedString* s = (BoxedString*)*p;
if (s == NULL || !PyString_Check(s))
Py_FatalError("PyString_InternInPlace: strings only please!");
/* If it's a string subclass, we don't really know what putting
it in the interned dict might do. */
if (!PyString_CheckExact(s))
return;
// Nothing to do if this exact object has already been interned.
if (PyString_CHECK_INTERNED(s))
return;
// Fetch the table slot for this string's contents; a null slot means no
// equal string has been interned yet.
auto& entry = interned_strings[s->s];
if (entry)
// An equal string is already interned: hand the caller that object instead.
*p = entry;
else {
entry = PyGC_AddRoot(s);
// CPython would make this a mortal interned string, but in our current
// implementation interned strings are immortal.
s->interned_state = SSTATE_INTERNED_IMMORTAL;
}
}
// Returns the interned_state field of the string p (function form used by the
// PyString_CHECK_INTERNED macro). p must be a string; anything else trips the
// RELEASE_ASSERT.
extern "C" int _PyString_CheckInterned(PyObject* p) noexcept {
RELEASE_ASSERT(PyString_Check(p), "");
BoxedString* s = (BoxedString*)p;
return s->interned_state;
}
/* Format codes
......@@ -2342,6 +2364,11 @@ extern "C" int _PyString_Resize(PyObject** pv, Py_ssize_t newsize) noexcept {
if (newsize == s->size())
return 0;
if (PyString_CHECK_INTERNED(s)) {
*pv = 0;
return -1;
}
if (newsize < s->size()) {
// XXX resize the box (by reallocating) smaller if it makes sense
s->s = llvm::StringRef(s->data(), newsize);
......
......@@ -278,7 +278,7 @@ extern "C" BoxedFunctionBase::BoxedFunctionBase(CLFunction* f)
this->modname = PyDict_GetItemString(getGlobalsDict(), "__name__");
this->doc = f->source->getDocString();
} else {
this->modname = boxStringPtr(&builtinStr);
this->modname = PyString_InternFromString("__builtin__");
this->doc = None;
}
......@@ -301,7 +301,7 @@ extern "C" BoxedFunctionBase::BoxedFunctionBase(CLFunction* f, std::initializer_
this->modname = PyDict_GetItemString(getGlobalsDict(), "__name__");
this->doc = f->source->getDocString();
} else {
this->modname = boxStringPtr(&builtinStr);
this->modname = PyString_InternFromString("__builtin__");
this->doc = None;
}
......
......@@ -420,6 +420,7 @@ public:
class BoxedString : public Box {
public:
llvm::StringRef s;
char interned_state;
char* data() { return const_cast<char*>(s.data()); }
size_t size() { return s.size(); }
......
try:
print intern(123)
except TypeError:
print "caught expected TypeError"
print intern("abcd")
class StringSubclass(str):
pass
# CPython does not allow interning on subclasses of str
try:
print intern(StringSubclass())
except TypeError:
print "caught expected TypeError from subclassing"
s1 = "Test"
s2 = " String"
print s1+s2 is s1+s2
print intern(s1+s2) is intern(s1+s2)
import marshal
o = [-1, 1.23456789, complex(1.2, 3.4)]
o += [True, False, None]
o += ["Hello World!", u"Hello World!"]
o += ["Hello World!", u"Hello World!", intern("Interned")]
o += [{ "Key" : "Value" }, set(["Set"]), frozenset(["FrozenSet"]), (1, 2, 3), [1, 2, 3]]
for i in o:
s = marshal.dumps(i)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment