Commit 217c098f authored by Marius Wachtler's avatar Marius Wachtler

Add support for string interning

We could use this in many more places if performance and memory benchmarking show it's an improvement.
parent 181d5850
......@@ -98,7 +98,10 @@ PyAPI_FUNC(void) _Py_ReleaseInternedStrings(void) PYSTON_NOEXCEPT;
PyAPI_FUNC(char) PyString_GetItem(PyObject *, Py_ssize_t) PYSTON_NOEXCEPT;
/* Use only if you know it's a string */
#define PyString_CHECK_INTERNED(op) (((PyStringObject *)(op))->ob_sstate)
// Pyston changes: these aren't direct macros any more [they potentially could be though]
//#define PyString_CHECK_INTERNED(op) (((PyStringObject *)(op))->ob_sstate)
/* Function-backed replacement for the direct ob_sstate field access:
   returns the interned state of the given string object. */
PyAPI_FUNC(int) _PyString_CheckInterned(PyObject *) PYSTON_NOEXCEPT;
/* The argument is parenthesized so the cast applies to the whole expression
   passed in (e.g. `a + 1` or `cond ? x : y`), not just its first operand. */
#define PyString_CHECK_INTERNED(op) _PyString_CheckInterned((PyObject*)(op))
/* Macro, trading safety for speed */
// Pyston changes: these aren't direct macros any more [they potentially could be though]
......
......@@ -334,8 +334,6 @@ w_object(PyObject *v, WFILE *p)
}
#endif
else if (PyString_CheckExact(v)) {
// Pyston change: I think we don't need this
/*
if (p->strings && PyString_CHECK_INTERNED(v)) {
PyObject *o = PyDict_GetItem(p->strings, v);
if (o) {
......@@ -361,8 +359,6 @@ w_object(PyObject *v, WFILE *p)
else {
w_byte(TYPE_STRING, p);
}
*/
w_byte(TYPE_STRING, p);
w_pstring(PyBytes_AS_STRING(v), PyString_GET_SIZE(v), p);
}
#ifdef Py_USING_UNICODE
......
......@@ -974,16 +974,6 @@ Box* input(Box* prompt) {
throwCAPIException();
}
// Builtin `intern`: returns the interned string for `str`.
//
// Only exact string objects are accepted:
//  - anything that is not a string raises TypeError naming the offending type;
//  - a string subclass raises TypeError as well.
// The lookup goes through the C-string API (PyString_AsString ->
// PyString_InternFromString), so the key is the char* contents of `str`.
Box* intern(Box* str) {
    // Reject non-strings first so they get the "must be string" message.
    const bool is_str = PyString_Check(str);
    if (!is_str)
        raiseExcHelper(TypeError, "must be string, not %s", getTypeName(str));

    // Subclasses of str are refused rather than interned.
    const bool is_exact_str = PyString_CheckExact(str);
    if (!is_exact_str)
        raiseExcHelper(TypeError, "can't intern subclass of string");

    const char* chars = PyString_AsString(str);
    return PyString_InternFromString(chars);
}
Box* builtinRound(Box* _number, Box* _ndigits) {
if (!isSubclass(_number->cls, float_cls))
raiseExcHelper(TypeError, "a float is required");
......@@ -1155,9 +1145,6 @@ void setupBuiltins() {
builtins_module->giveAttr("__import__", new BoxedBuiltinFunctionOrMethod(import_func, "__import__",
{ None, None, None, new BoxedInt(-1) }));
Box* intern_obj = new BoxedBuiltinFunctionOrMethod(boxRTFunction((void*)intern, UNKNOWN, 1), "intern");
builtins_module->giveAttr("intern", intern_obj);
enumerate_cls = BoxedHeapClass::create(type_cls, object_cls, &BoxedEnumerate::gcHandler, 0, 0,
sizeof(BoxedEnumerate), false, "enumerate");
enumerate_cls->giveAttr(
......
......@@ -52,7 +52,7 @@ extern "C" PyObject* string_splitlines(PyStringObject* self, PyObject* args) noe
namespace pyston {
BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n) {
BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(n != llvm::StringRef::npos, "");
if (s) {
memmove(data(), s, n);
......@@ -62,20 +62,21 @@ BoxedString::BoxedString(const char* s, size_t n) : s(storage(), n) {
}
}
BoxedString::BoxedString(llvm::StringRef lhs, llvm::StringRef rhs) : s(storage(), lhs.size() + rhs.size()) {
// Builds a string holding the concatenation `lhs + rhs`.
// The new string starts out not interned. The member `s` is set up to view
// storage() with the combined length; the bytes of both halves are copied in
// and a terminating NUL is written just past the end.
// NOTE(review): assumes storage() provides at least total_len + 1 bytes — confirm.
BoxedString::BoxedString(llvm::StringRef lhs, llvm::StringRef rhs)
    : s(storage(), lhs.size() + rhs.size()), interned_state(SSTATE_NOT_INTERNED) {
    const size_t left_len = lhs.size();
    const size_t right_len = rhs.size();
    const size_t total_len = left_len + right_len;
    RELEASE_ASSERT(total_len != llvm::StringRef::npos, "");

    char* dst = data();
    memmove(dst, lhs.data(), left_len);
    memmove(dst + left_len, rhs.data(), right_len);
    dst[total_len] = 0;
}
BoxedString::BoxedString(llvm::StringRef s) : s(storage(), s.size()) {
// Builds a string holding a copy of the bytes viewed by `s`.
// The new string starts out not interned. Inside the body, `s` names the
// parameter (it shadows the member); data() reaches the member's buffer.
// A terminating NUL is written just past the copied bytes.
// NOTE(review): assumes storage() provides at least len + 1 bytes — confirm.
BoxedString::BoxedString(llvm::StringRef s) : s(storage(), s.size()), interned_state(SSTATE_NOT_INTERNED) {
    const size_t len = s.size();
    RELEASE_ASSERT(len != llvm::StringRef::npos, "");

    char* dst = data();
    memmove(dst, s.data(), len);
    dst[len] = 0;
}
BoxedString::BoxedString(size_t n, char c) : s(storage(), n) {
BoxedString::BoxedString(size_t n, char c) : s(storage(), n), interned_state(SSTATE_NOT_INTERNED) {
RELEASE_ASSERT(n != llvm::StringRef::npos, "");
memset(data(), c, n);
data()[n] = 0;
......@@ -345,23 +346,44 @@ extern "C" Box* strAdd(BoxedString* lhs, Box* _rhs) {
}
static llvm::StringMap<Box*> interned_strings;
extern "C" PyObject* PyString_InternFromString(const char* s) noexcept {
RELEASE_ASSERT(s, "");
auto it = interned_strings.find(s);
if (it == interned_strings.end()) {
Box* b = PyGC_AddRoot(boxString(s));
assert(b);
interned_strings[s] = b;
return b;
} else {
assert(it->second);
return it->second;
auto& entry = interned_strings[s];
if (!entry) {
entry = PyGC_AddRoot(boxString(s));
// CPython returns mortal strings, but in our current implementation they are immortal
((BoxedString*)entry)->interned_state = SSTATE_INTERNED_IMMORTAL;
}
return entry;
}
extern "C" void PyString_InternInPlace(PyObject** o) noexcept {
Py_FatalError("unimplemented");
// Interns the string referenced by *p.
//
//  - *p must be a non-NULL string object; anything else is a fatal error.
//  - String subclasses are left untouched: we don't really know what putting
//    one in the interned table might do.
//  - Strings already marked as interned are left untouched.
//  - If an equal string is already in the interned table, *p is redirected to
//    that table entry.
//  - Otherwise this object becomes the table entry: it is passed to
//    PyGC_AddRoot and marked SSTATE_INTERNED_IMMORTAL.
extern "C" void PyString_InternInPlace(PyObject** p) noexcept {
    BoxedString* str = (BoxedString*)*p;
    if (str == NULL || !PyString_Check(str))
        Py_FatalError("PyString_InternInPlace: strings only please!");

    // Nothing to do for subclasses or for strings that are already interned.
    if (!PyString_CheckExact(str) || PyString_CHECK_INTERNED(str))
        return;

    // operator[] creates an empty slot when the contents are not yet known.
    auto& slot = interned_strings[str->s];
    if (!slot) {
        slot = PyGC_AddRoot(str);
        // CPython would hand back a mortal interned string; in the current
        // implementation interned strings are immortal.
        str->interned_state = SSTATE_INTERNED_IMMORTAL;
        return;
    }

    // An equal string was interned earlier: hand that one back to the caller.
    *p = slot;
}
// Returns the interned_state field of the string object `p`.
// `p` must be a string: this is enforced with RELEASE_ASSERT, not reported
// through a Python exception.
extern "C" int _PyString_CheckInterned(PyObject* p) noexcept {
    RELEASE_ASSERT(PyString_Check(p), "");
    return ((BoxedString*)p)->interned_state;
}
/* Format codes
......@@ -2342,6 +2364,11 @@ extern "C" int _PyString_Resize(PyObject** pv, Py_ssize_t newsize) noexcept {
if (newsize == s->size())
return 0;
if (PyString_CHECK_INTERNED(s)) {
*pv = 0;
return -1;
}
if (newsize < s->size()) {
// XXX resize the box (by reallocating) smaller if it makes sense
s->s = llvm::StringRef(s->data(), newsize);
......
......@@ -278,7 +278,7 @@ extern "C" BoxedFunctionBase::BoxedFunctionBase(CLFunction* f)
this->modname = PyDict_GetItemString(getGlobalsDict(), "__name__");
this->doc = f->source->getDocString();
} else {
this->modname = boxStringPtr(&builtinStr);
this->modname = PyString_InternFromString("__builtin__");
this->doc = None;
}
......@@ -301,7 +301,7 @@ extern "C" BoxedFunctionBase::BoxedFunctionBase(CLFunction* f, std::initializer_
this->modname = PyDict_GetItemString(getGlobalsDict(), "__name__");
this->doc = f->source->getDocString();
} else {
this->modname = boxStringPtr(&builtinStr);
this->modname = PyString_InternFromString("__builtin__");
this->doc = None;
}
......
......@@ -405,6 +405,7 @@ public:
class BoxedString : public Box {
public:
llvm::StringRef s;
char interned_state;
char* data() { return const_cast<char*>(s.data()); }
size_t size() { return s.size(); }
......
......@@ -13,3 +13,8 @@ try:
print intern(StringSubclass())
except TypeError:
print "caught expected TypeError from subclassing"
s1 = "Test"
s2 = " String"
print s1+s2 is s1+s2
print intern(s1+s2) is intern(s1+s2)
import marshal
o = [-1, 1.23456789, complex(1.2, 3.4)]
o += [True, False, None]
o += ["Hello World!", u"Hello World!"]
o += ["Hello World!", u"Hello World!", intern("Interned")]
o += [{ "Key" : "Value" }, set(["Set"]), frozenset(["FrozenSet"]), (1, 2, 3), [1, 2, 3]]
for i in o:
s = marshal.dumps(i)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment