Commit e68955cf authored by Fredrik Lundh's avatar Fredrik Lundh

needforspeed: new replace implementation by Andrew Dalke. replace is

now about 3x faster on my machine, for the replace tests from string-
bench.
parent 0c71f88f
......@@ -2379,175 +2379,623 @@ string_translate(PyStringObject *self, PyObject *args)
}
/* What follows is used for implementing replace(). Perry Stoll. */
#define FORWARD 1
#define REVERSE -1
/*
mymemfind
/* find and count characters and substrings */
strstr replacement for arbitrary blocks of memory.
/* Don't call if length < 2 */
#define Py_STRING_MATCH(target, offset, pattern, length) \
(target[offset] == pattern[0] && \
target[offset+length-1] == pattern[length-1] && \
!memcmp(target+offset+1, pattern+1, length-2) )
#define findchar(target, target_len, c) \
((char *)memchr((const void *)(target), c, target_len))
/* String ops must return a string. */
/* If the object is subclass of string, create a copy */
static PyStringObject *
return_self(PyStringObject *self)
{
if (PyString_CheckExact(self)) {
Py_INCREF(self);
return self;
}
return (PyStringObject *)PyString_FromStringAndSize(
PyString_AS_STRING(self),
PyString_GET_SIZE(self));
}
Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
*/
static Py_ssize_t
mymemfind(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
countchar(char *target, int target_len, char c)
{
register Py_ssize_t ii;
Py_ssize_t count=0;
char *start=target;
char *end=target+target_len;
while ( (start=findchar(start, end-start, c)) != NULL ) {
count++;
start += 1;
}
/* pattern can not occur in the last pat_len-1 chars */
len -= pat_len;
return count;
}
for (ii = 0; ii <= len; ii++) {
if (mem[ii] == pat[0] && memcmp(&mem[ii], pat, pat_len) == 0) {
return ii;
}
static Py_ssize_t
findstring(char *target, Py_ssize_t target_len,
char *pattern, Py_ssize_t pattern_len,
Py_ssize_t start,
Py_ssize_t end,
int direction)
{
if (start < 0) {
start += target_len;
if (start < 0)
start = 0;
}
if (end > target_len) {
end = target_len;
} else if (end < 0) {
end += target_len;
if (end < 0)
end = 0;
}
/* zero-length substrings always match at the first attempt */
if (pattern_len == 0)
return (direction > 0) ? start : end;
end -= pattern_len;
if (direction < 0) {
for (; end >= start; end--)
if (Py_STRING_MATCH(target, end, pattern, pattern_len))
return end;
} else {
for (; start <= end; start++)
if (Py_STRING_MATCH(target, start, pattern, pattern_len))
return start;
}
return -1;
}
/*
mymemcnt
Py_ssize_t
countstring(char *target, Py_ssize_t target_len,
char *pattern, Py_ssize_t pattern_len,
Py_ssize_t start,
Py_ssize_t end,
int direction)
{
Py_ssize_t count=0;
Return the number of distinct times PAT is found in MEM.
meaning mem=1111 and pat==11 returns 2.
mem=11111 and pat==11 also return 2.
*/
static Py_ssize_t
mymemcnt(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
if (start < 0) {
start += target_len;
if (start < 0)
start = 0;
}
if (end > target_len) {
end = target_len;
} else if (end < 0) {
end += target_len;
if (end < 0)
end = 0;
}
/* zero-length substrings match everywhere */
if (pattern_len == 0)
return target_len+1;
end -= pattern_len;
if (direction < 0) {
for (; end >= start; end--)
if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
count++;
end -= pattern_len-1;
}
} else {
for (; start <= end; start++)
if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
count++;
start += pattern_len-1;
}
}
return count;
}
/* Algorithms for difference cases of string replacement */
/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
static PyStringObject *
replace_interleave(PyStringObject *self,
PyStringObject *to,
Py_ssize_t maxcount)
{
register Py_ssize_t offset = 0;
Py_ssize_t nfound = 0;
char *self_s, *to_s, *result_s;
Py_ssize_t self_len, to_len, result_len;
Py_ssize_t count, i, product;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
to_len = PyString_GET_SIZE(to);
/* 1 at the end plus 1 after every character */
count = self_len+1;
if (maxcount < count)
count = maxcount;
/* Check for overflow */
/* result_len = count * to_len + self_len; */
product = count * to_len;
if (product / to_len != count) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
result_len = product + self_len;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
if (! (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) )
return NULL;
while (len >= 0) {
offset = mymemfind(mem, len, pat, pat_len);
if (offset == -1)
break;
mem += offset + pat_len;
len -= offset + pat_len;
nfound++;
self_s = PyString_AS_STRING(self);
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
result_s = PyString_AS_STRING(result);
/* TODO: special case single character, which doesn't need memcpy */
/* Lay the first one down (guaranteed this will occur) */
memcpy(result_s, to_s, to_len);
result_s += to_len;
count -= 1;
for (i=0; i<count; i++) {
*result_s++ = *self_s++;
memcpy(result_s, to_s, to_len);
result_s += to_len;
}
return nfound;
/* Copy the rest of the original string */
memcpy(result_s, self_s, self_len-i);
return result;
}
/*
mymemreplace
/* Special case for deleting a single character */
/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
static PyStringObject *
replace_delete_single_character(PyStringObject *self,
char from_c, Py_ssize_t maxcount)
{
char *self_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, result_len;
Py_ssize_t count;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
count = countchar(self_s, self_len, from_c);
if (count == 0) {
return return_self(self);
}
if (count > maxcount)
count = maxcount;
result_len = self_len - count; /* from_len == 1 */
assert(result_len>=0);
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
Return a string in which all occurrences of PAT in memory STR are
replaced with SUB.
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
memcpy(result_s, start, next-start);
result_s += (next-start);
start = next+1;
}
memcpy(result_s, start, end-start);
return result;
}
If length of PAT is less than length of STR or there are no occurrences
of PAT in STR, then the original string is returned. Otherwise, a new
string is allocated here and returned.
/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
static PyStringObject *
replace_delete_substring(PyStringObject *self, PyStringObject *from,
Py_ssize_t maxcount) {
char *self_s, *from_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, from_len, result_len;
Py_ssize_t count, offset;
PyStringObject *result;
self_len = PyString_GET_SIZE(self);
self_s = PyString_AS_STRING(self);
from_len = PyString_GET_SIZE(from);
from_s = PyString_AS_STRING(from);
count = countstring(self_s, self_len,
from_s, from_len,
0, self_len, 1);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches */
return return_self(self);
}
result_len = self_len - (count * from_len);
assert (result_len>=0);
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL )
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset == -1)
break;
next = start + offset;
memcpy(result_s, start, next-start);
result_s += (next-start);
start = next+from_len;
}
memcpy(result_s, start, end-start);
return result;
}
on return, out_len is:
the length of output string, or
-1 if the input string is returned, or
unchanged if an error occurs (no memory).
/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
static PyStringObject *
replace_single_character_in_place(PyStringObject *self,
char from_c, char to_c,
Py_ssize_t maxcount)
{
char *self_s, *result_s, *start, *end, *next;
Py_ssize_t self_len;
PyStringObject *result;
/* The result string will be the same size */
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
next = findchar(self_s, self_len, from_c);
if (next == NULL) {
/* No matches; return the original string */
return return_self(self);
}
/* Need to make a new string */
result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len);
if (result == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
/* change everything in-place, starting with this one */
start = result_s + (next-self_s);
*start = to_c;
start++;
end = result_s + self_len;
while (--maxcount > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
*next = to_c;
start = next+1;
}
return result;
}
return value is:
the new string allocated locally, or
NULL if an error occurred.
*/
static char *
mymemreplace(const char *str, Py_ssize_t len, /* input string */
const char *pat, Py_ssize_t pat_len, /* pattern string to find */
const char *sub, Py_ssize_t sub_len, /* substitution string */
Py_ssize_t count, /* number of replacements */
Py_ssize_t *out_len)
/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_substring_in_place(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount)
{
char *out_s;
char *new_s;
Py_ssize_t nfound, offset, new_len;
Py_ssize_t product, delta;
if (len == 0 || (pat_len == 0 && sub_len == 0) || pat_len > len)
goto return_same;
/* find length of output string */
nfound = (pat_len > 0) ? mymemcnt(str, len, pat, pat_len) : len + 1;
if (count < 0)
count = PY_SSIZE_T_MAX;
else if (nfound > count)
nfound = count;
if (nfound == 0)
goto return_same;
delta = (sub_len - pat_len);
if (delta == 0) {
new_len = len;
} else {
product = nfound * (sub_len - pat_len);
if ((product / (sub_len - pat_len)) != nfound) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
new_len = len + product;
if (new_len < 0) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
return NULL;
}
}
char *result_s, *start, *end;
char *self_s, *from_s, *to_s;
Py_ssize_t self_len, from_len, offset;
PyStringObject *result;
/* The result string will be the same size */
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
from_s = PyString_AS_STRING(from);
from_len = PyString_GET_SIZE(from);
to_s = PyString_AS_STRING(to);
offset = findstring(self_s, self_len,
from_s, from_len,
0, self_len, FORWARD);
if (offset == -1) {
/* No matches; return the original string */
return return_self(self);
}
/* Need to make a new string */
result = (PyStringObject *) PyString_FromStringAndSize(self_s, self_len);
if (result == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
/* change everything in-place, starting with this one */
start = result_s + offset;
memcpy(start, to_s, from_len);
start += from_len;
end = result_s + self_len;
while ( --maxcount > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset==-1)
break;
memcpy(start+offset, to_s, from_len);
start += offset+from_len;
}
return result;
}
if (new_len == 0) {
/* Have to allocate something for the caller to free(). */
out_s = (char *)PyMem_MALLOC(1);
if (out_s == NULL)
return NULL;
out_s[0] = '\0';
/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_single_character(PyStringObject *self,
char from_c,
PyStringObject *to,
Py_ssize_t maxcount)
{
char *self_s, *to_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, to_len, result_len;
Py_ssize_t count, product;
PyStringObject *result;
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
count = countchar(self_s, self_len, from_c);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
}
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
/* use the difference between current and new, hence the "-1" */
/* result_len = self_len + count * (to_len-1) */
product = count * (to_len-1);
if (product / (to_len-1) != count) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
else {
assert(new_len > 0);
new_s = (char *)PyMem_MALLOC(new_len);
if (new_s == NULL)
return NULL;
out_s = new_s;
if (pat_len > 0) {
for (; nfound > 0; --nfound) {
/* find index of next instance of pattern */
offset = mymemfind(str, len, pat, pat_len);
if (offset == -1)
break;
/* copy non matching part of input string */
memcpy(new_s, str, offset);
str += offset + pat_len;
len -= offset + pat_len;
/* copy substitute into the output string */
new_s += offset;
memcpy(new_s, sub, sub_len);
new_s += sub_len;
}
/* copy any remaining values into output string */
if (len > 0)
memcpy(new_s, str, len);
}
else {
for (;;++str, --len) {
memcpy(new_s, sub, sub_len);
new_s += sub_len;
if (--nfound <= 0) {
memcpy(new_s, str, len);
break;
}
*new_s++ = *str;
}
result_len = self_len + product;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
next = findchar(start, end-start, from_c);
if (next == NULL)
break;
if (next == start) {
/* replace with the 'to' */
memcpy(result_s, to_s, to_len);
result_s += to_len;
start += 1;
} else {
/* copy the unchanged old then the 'to' */
memcpy(result_s, start, next-start);
result_s += (next-start);
memcpy(result_s, to_s, to_len);
result_s += to_len;
start = next+1;
}
}
*out_len = new_len;
return out_s;
/* Copy the remainder of the remaining string */
memcpy(result_s, start, end-start);
return result;
}
return_same:
*out_len = -1;
return (char *)str; /* cast away const */
/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
static PyStringObject *
replace_substring(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount) {
char *self_s, *from_s, *to_s, *result_s;
char *start, *next, *end;
Py_ssize_t self_len, from_len, to_len, result_len;
Py_ssize_t count, offset, product;
PyStringObject *result;
self_s = PyString_AS_STRING(self);
self_len = PyString_GET_SIZE(self);
from_s = PyString_AS_STRING(from);
from_len = PyString_GET_SIZE(from);
count = countstring(self_s, self_len,
from_s, from_len,
0, self_len, FORWARD);
if (count > maxcount)
count = maxcount;
if (count == 0) {
/* no matches, return unchanged */
return return_self(self);
}
to_s = PyString_AS_STRING(to);
to_len = PyString_GET_SIZE(to);
/* Check for overflow */
/* result_len = self_len + count * (to_len-from_len) */
product = count * (to_len-from_len);
if (product / (to_len-from_len) != count) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
result_len = self_len + product;
if (result_len < 0) {
PyErr_SetString(PyExc_OverflowError, "replace string is too long");
return NULL;
}
if ( (result = (PyStringObject *)
PyString_FromStringAndSize(NULL, result_len)) == NULL)
return NULL;
result_s = PyString_AS_STRING(result);
start = self_s;
end = self_s + self_len;
while (count-- > 0) {
offset = findstring(start, end-start,
from_s, from_len,
0, end-start, FORWARD);
if (offset == -1)
break;
next = start+offset;
if (next == start) {
/* replace with the 'to' */
memcpy(result_s, to_s, to_len);
result_s += to_len;
start += from_len;
} else {
/* copy the unchanged old then the 'to' */
memcpy(result_s, start, next-start);
result_s += (next-start);
memcpy(result_s, to_s, to_len);
result_s += to_len;
start = next+from_len;
}
}
/* Copy the remainder of the remaining string */
memcpy(result_s, start, end-start);
return result;
}
static PyStringObject *
replace(PyStringObject *self,
PyStringObject *from,
PyStringObject *to,
Py_ssize_t maxcount)
{
Py_ssize_t from_len, to_len;
if (maxcount < 0) {
maxcount = PY_SSIZE_T_MAX;
} else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
/* nothing to do; return the original string */
return return_self(self);
}
from_len = PyString_GET_SIZE(from);
to_len = PyString_GET_SIZE(to);
if (maxcount == 0 ||
(from_len == 0 && to_len == 0)) {
/* nothing to do; return the original string */
return return_self(self);
}
/* Handle zero-length special cases */
if (from_len == 0) {
/* insert the 'to' string everywhere. */
/* >>> "Python".replace("", ".") */
/* '.P.y.t.h.o.n.' */
return replace_interleave(self, to, maxcount);
}
/* Except for "".replace("", "A") == "A" there is no way beyond this */
/* point for an empty self string to generate a non-empty string */
/* Special case so the remaining code always gets a non-empty string */
if (PyString_GET_SIZE(self) == 0) {
return return_self(self);
}
if (to_len == 0) {
/* delete all occurances of 'from' string */
if (from_len == 1) {
return replace_delete_single_character(
self, PyString_AS_STRING(from)[0], maxcount);
} else {
return replace_delete_substring(self, from, maxcount);
}
}
/* Handle special case where both strings have the same length */
if (from_len == to_len) {
if (from_len == 1) {
return replace_single_character_in_place(
self,
PyString_AS_STRING(from)[0],
PyString_AS_STRING(to)[0],
maxcount);
} else {
return replace_substring_in_place(
self, from, to, maxcount);
}
}
/* Otherwise use the more generic algorithms */
if (from_len == 1) {
return replace_single_character(self, PyString_AS_STRING(from)[0],
to, maxcount);
} else {
/* len('from')>=2, len('to')>=1 */
return replace_substring(self, from, to, maxcount);
}
}
PyDoc_STRVAR(replace__doc__,
"S.replace (old, new[, count]) -> string\n\
\n\
......@@ -2558,67 +3006,42 @@ given, only the first count occurrences are replaced.");
static PyObject *
string_replace(PyStringObject *self, PyObject *args)
{
const char *str = PyString_AS_STRING(self), *sub, *repl;
char *new_s;
const Py_ssize_t len = PyString_GET_SIZE(self);
Py_ssize_t sub_len, repl_len, out_len;
Py_ssize_t count = -1;
PyObject *newobj;
PyObject *subobj, *replobj;
PyObject *from, *to;
char *tmp_s;
Py_ssize_t tmp_len;
if (!PyArg_ParseTuple(args, "OO|n:replace",
&subobj, &replobj, &count))
if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
return NULL;
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
sub_len = PyString_GET_SIZE(subobj);
if (PyString_Check(from)) {
/* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(subobj))
if (PyUnicode_Check(from))
return PyUnicode_Replace((PyObject *)self,
subobj, replobj, count);
from, to, count);
#endif
else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
else if (PyObject_AsCharBuffer(from, &tmp_s, &tmp_len))
return NULL;
if (PyString_Check(replobj)) {
repl = PyString_AS_STRING(replobj);
repl_len = PyString_GET_SIZE(replobj);
if (PyString_Check(to)) {
/* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(replobj))
else if (PyUnicode_Check(to))
return PyUnicode_Replace((PyObject *)self,
subobj, replobj, count);
from, to, count);
#endif
else if (PyObject_AsCharBuffer(replobj, &repl, &repl_len))
else if (PyObject_AsCharBuffer(to, &tmp_s, &tmp_len))
return NULL;
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
if (new_s == NULL) {
if (!PyErr_Occurred())
PyErr_NoMemory();
return NULL;
}
if (out_len == -1) {
if (PyString_CheckExact(self)) {
/* we're returning another reference to self */
newobj = (PyObject*)self;
Py_INCREF(newobj);
}
else {
newobj = PyString_FromStringAndSize(str, len);
if (newobj == NULL)
return NULL;
}
}
else {
newobj = PyString_FromStringAndSize(new_s, out_len);
PyMem_FREE(new_s);
}
return newobj;
return (PyObject *)replace((PyStringObject *) self,
(PyStringObject *) from,
(PyStringObject *) to, count);
}
/** End DALKE **/
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment