Merged revisions 77461 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77461 | antoine.pitrou | 2010-01-13 08:55:48 +0100 (mer., 13 janv. 2010) | 5 lines

  Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
  methods of bytes, bytearray and unicode objects by using a common
  implementation based on stringlib's fast search. Patch by Florent Xicluna.
........

Merged revisions 77461 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77461 | antoine.pitrou | 2010-01-13 08:55:48 +0100 (mer., 13 janv. 2010) | 5 lines

  Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
  methods of bytes, bytearray and unicode objects by using a common
  implementation based on stringlib's fast search. Patch by Florent Xicluna.
........
26447c09 · Antoine Pitrou · 67a22e9d · 26447c09 · 26447c09 · 26447c09
Commit 26447c09 authored Jan 13, 2010 by Antoine Pitrou
16 changed files
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -582,6 +582,7 @@ BYTESTR_DEPS = \
 		$(srcdir)/Objects/stringlib/fastsearch.h \
 		$(srcdir)/Objects/stringlib/find.h \
 		$(srcdir)/Objects/stringlib/partition.h \
+		$(srcdir)/Objects/stringlib/split.h \
 		$(srcdir)/Objects/stringlib/stringdefs.h \
 		$(srcdir)/Objects/stringlib/string_format.h \
 		$(srcdir)/Objects/stringlib/transmogrify.h \

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1?
 Core and Builtins
 -----------------
+- Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
+  methods of bytes, bytearray and unicode objects by using a common
+  implementation based on stringlib's fast search.  Patch by Florent Xicluna.
 - Issue #7632: Fix a crash in dtoa.c that occurred in debug builds
  when parsing certain long numeric strings corresponding to subnormal
  values.  Also fix a number of bugs in dtoa.c that could lead to

--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
--- a/Objects/stringlib/count.h
+++ b/Objects/stringlib/count.h
@@ -9,28 +9,22 @@
 Py_LOCAL_INLINE(Py_ssize_t)
 stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
-                const STRINGLIB_CHAR* sub, Py_ssize_t sub_len)
+                const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
+                Py_ssize_t maxcount)
 {
    Py_ssize_t count;
    if (str_len < 0)
        return 0; /* start > len(str) */
    if (sub_len == 0)
-        return str_len + 1;
+        return (str_len < maxcount) ? str_len + 1 : maxcount;
-    count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT);
+    count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT);
    if (count < 0)
-        count = 0; /* no match */
+        return 0; /* no match */
    return count;
 }
 #endif
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
--- a/Objects/stringlib/ctype.h
+++ b/Objects/stringlib/ctype.h
@@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self)
                    STRINGLIB_LEN(self));
    return newobj;
 }
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -18,10 +18,13 @@
 #define FAST_SEARCH 1
 #define FAST_RSEARCH 2
+#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
+#define BLOOM(mask, ch)     ((mask &  (1 << ((ch) & (LONG_BIT - 1)))))
 Py_LOCAL_INLINE(Py_ssize_t)
 fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
           const STRINGLIB_CHAR* p, Py_ssize_t m,
-           int mode)
+           Py_ssize_t maxcount, int mode)
 {
    long mask;
    Py_ssize_t skip, count = 0;
@@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
    w = n - m;
-    if (w < 0)
+    if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
        return -1;
    /* look for special cases */
@@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
        /* use special case for 1-character strings */
        if (mode == FAST_COUNT) {
            for (i = 0; i < n; i++)
-                if (s[i] == p[0])
+                if (s[i] == p[0]) {
                    count++;
+                    if (count == maxcount)
+                        return maxcount;
+                }
            return count;
        } else if (mode == FAST_SEARCH) {
            for (i = 0; i < n; i++)
@@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
    mlast = m - 1;
    skip = mlast - 1;
+    mask = 0;
    if (mode != FAST_RSEARCH) {
        /* create compressed boyer-moore delta 1 table */
        /* process pattern[:-1] */
-        for (mask = i = 0; i < mlast; i++) {
+        for (i = 0; i < mlast; i++) {
-            mask |= (1 << (p[i] & 0x1F));
+            BLOOM_ADD(mask, p[i]);
            if (p[i] == p[mlast])
                skip = mlast - i - 1;
        }
        /* process pattern[-1] outside the loop */
-        mask |= (1 << (p[mlast] & 0x1F));
+        BLOOM_ADD(mask, p[mlast]);
        for (i = 0; i <= w; i++) {
            /* note: using mlast in the skip path slows things down on x86 */
@@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
                    if (mode != FAST_COUNT)
                        return i;
                    count++;
+                    if (count == maxcount)
+                        return maxcount;
                    i = i + mlast;
                    continue;
                }
                /* miss: check if next character is part of pattern */
-                if (!(mask & (1 << (s[i+m] & 0x1F))))
+                if (!BLOOM(mask, s[i+m]))
                    i = i + m;
                else
                    i = i + skip;
            } else {
                /* skip: check if next character is part of pattern */
-                if (!(mask & (1 << (s[i+m] & 0x1F))))
+                if (!BLOOM(mask, s[i+m]))
                    i = i + m;
            }
        }
@@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
        /* create compressed boyer-moore delta 1 table */
        /* process pattern[0] outside the loop */
-        mask = (1 << (p[0] & 0x1F));
+        BLOOM_ADD(mask, p[0]);
        /* process pattern[:0:-1] */
        for (i = mlast; i > 0; i--) {
-            mask |= (1 << (p[i] & 0x1F));
+            BLOOM_ADD(mask, p[i]);
            if (p[i] == p[0])
                skip = i - 1;
        }
@@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
                    /* got a match! */
                    return i;
                /* miss: check if previous character is part of pattern */
-                if (!(mask & (1 << (s[i-1] & 0x1F))))
+                if (!BLOOM(mask, s[i-1]))
                    i = i - m;
                else
                    i = i - skip;
            } else {
                /* skip: check if previous character is part of pattern */
-                if (!(mask & (1 << (s[i-1] & 0x1F))))
+                if (!BLOOM(mask, s[i-1]))
                    i = i - m;
            }
        }
@@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
 }
 #endif
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
--- a/Objects/stringlib/find.h
+++ b/Objects/stringlib/find.h
@@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    if (sub_len == 0)
        return offset;
-    pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH);
+    pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH);
    if (pos >= 0)
        pos += offset;
@@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    if (sub_len == 0)
        return str_len + offset;
-    pos = fastsearch(str, str_len, sub, sub_len, FAST_RSEARCH);
+    pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH);
    if (pos >= 0)
        pos += offset;
@@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
    return pos;
 }
+/* helper macro to fixup start/end slice values */
+#define ADJUST_INDICES(start, end, len)         \
+    if (end > len)                              \
+        end = len;                              \
+    else if (end < 0) {                         \
+        end += len;                             \
+        if (end < 0)                            \
+            end = 0;                            \
+    }                                           \
+    if (start < 0) {                            \
+        start += len;                           \
+        if (start < 0)                          \
+            start = 0;                          \
+    }
 Py_LOCAL_INLINE(Py_ssize_t)
 stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
                     Py_ssize_t start, Py_ssize_t end)
 {
-    if (start < 0)
+    ADJUST_INDICES(start, end, str_len);
-        start += str_len;
-    if (start < 0)
-        start = 0;
-    if (end > str_len)
-        end = str_len;
-    if (end < 0)
-        end += str_len;
-    if (end < 0)
-        end = 0;
    return stringlib_find(str + start, end - start, sub, sub_len, start);
 }
@@ -71,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                      const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
                      Py_ssize_t start, Py_ssize_t end)
 {
-    if (start < 0)
+    ADJUST_INDICES(start, end, str_len);
-        start += str_len;
-    if (start < 0)
-        start = 0;
-    if (end > str_len)
-        end = str_len;
-    if (end < 0)
-        end += str_len;
-    if (end < 0)
-        end = 0;
    return stringlib_rfind(str + start, end - start, sub, sub_len, start);
 }
@@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub)
        ) != -1;
 }
-#endif /* STRINGLIB_STR */
+#endif /* STRINGLIB_WANT_CONTAINS_OBJ */
-#ifdef FROM_UNICODE
+#if STRINGLIB_IS_UNICODE
 /*
 This function is a helper for the "find" family (find, rfind, index,
@@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring,
    return 1;
 }
-#endif /* FROM_UNICODE */
+#endif /* STRINGLIB_IS_UNICODE */
 #endif /* STRINGLIB_FIND_H */
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
--- a/Objects/stringlib/partition.h
+++ b/Objects/stringlib/partition.h
@@ -8,33 +8,39 @@
 #endif
 Py_LOCAL_INLINE(PyObject*)
-stringlib_partition(
+stringlib_partition(PyObject* str_obj,
-    PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+                    const STRINGLIB_CHAR* str, Py_ssize_t str_len,
-    PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
+                    PyObject* sep_obj,
-    )
+                    const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
 {
    PyObject* out;
    Py_ssize_t pos;
    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
-	return NULL;
+        return NULL;
    }
    out = PyTuple_New(3);
    if (!out)
-	return NULL;
+        return NULL;
-    pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH);
+    pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH);
    if (pos < 0) {
-	Py_INCREF(str_obj);
+#if STRINGLIB_MUTABLE
-	PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
+        PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len));
-	Py_INCREF(STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
-	PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0));
-	Py_INCREF(STRINGLIB_EMPTY);
+#else
-	PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
+        Py_INCREF(str_obj);
-	return out;
+        PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
+        Py_INCREF(STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+        Py_INCREF(STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
+#endif
+        return out;
    }
    PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
@@ -44,41 +50,47 @@ stringlib_partition(
    PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
    if (PyErr_Occurred()) {
-	Py_DECREF(out);
+        Py_DECREF(out);
-	return NULL;
+        return NULL;
    }
    return out;
 }
 Py_LOCAL_INLINE(PyObject*)
-stringlib_rpartition(
+stringlib_rpartition(PyObject* str_obj,
-    PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
+                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
-    PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
+                     PyObject* sep_obj,
-    )
+                     const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
 {
    PyObject* out;
    Py_ssize_t pos;
    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
-	return NULL;
+        return NULL;
    }
    out = PyTuple_New(3);
    if (!out)
-	return NULL;
+        return NULL;
-    pos = fastsearch(str, str_len, sep, sep_len, FAST_RSEARCH);
+    pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH);
    if (pos < 0) {
-	Py_INCREF(STRINGLIB_EMPTY);
+#if STRINGLIB_MUTABLE
-	PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0));
-	Py_INCREF(STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
-	PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len));
-	Py_INCREF(str_obj);
+#else
-	PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
+        Py_INCREF(STRINGLIB_EMPTY);
-	return out;
+        PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
+        Py_INCREF(STRINGLIB_EMPTY);
+        PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
+        Py_INCREF(str_obj);
+        PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
+#endif
+        return out;
    }
    PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
@@ -88,18 +100,11 @@ stringlib_rpartition(
    PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
    if (PyErr_Occurred()) {
-	Py_DECREF(out);
+        Py_DECREF(out);
-	return NULL;
+        return NULL;
    }
    return out;
 }
 #endif
-/*
-Local variables:
-c-basic-offset: 4
-indent-tabs-mode: nil
-End:
-*/
--- a/Objects/stringlib/split.h
+++ b/Objects/stringlib/split.h
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@@ -11,6 +11,8 @@
 #define STRINGLIB_TYPE_NAME      "string"
 #define STRINGLIB_PARSE_CODE     "S"
 #define STRINGLIB_EMPTY          nullstring
+#define STRINGLIB_ISSPACE        Py_ISSPACE
+#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
 #define STRINGLIB_ISDECIMAL(x)   ((x >= '0') && (x <= '9'))
 #define STRINGLIB_TODECIMAL(x)   (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1)
 #define STRINGLIB_TOUPPER        Py_TOUPPER

--- a/Objects/stringlib/transmogrify.h
+++ b/Objects/stringlib/transmogrify.h
 /* NOTE: this API is -ONLY- for use with single byte character strings. */
 /* Do not use it with Unicode. */
-#include "bytes_methods.h"
-#ifndef STRINGLIB_MUTABLE
-#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0"
-#define STRINGLIB_MUTABLE 0
-#endif
 /* the more complicated methods.  parts of these should be pulled out into the
   shared code in bytes_methods.c to cut down on duplicate code bloat.  */
@@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args)
    return (PyObject*) s;
 }
-#define _STRINGLIB_SPLIT_APPEND(data, left, right)		\
-	str = STRINGLIB_NEW((data) + (left),	                \
-					 (right) - (left));	\
-	if (str == NULL)					\
-		goto onError;					\
-	if (PyList_Append(list, str)) {				\
-		Py_DECREF(str);					\
-		goto onError;					\
-	}							\
-	else							\
-		Py_DECREF(str);
-PyDoc_STRVAR(splitlines__doc__,
-"B.splitlines([keepends]) -> list of lines\n\
-\n\
-Return a list of the lines in B, breaking at line boundaries.\n\
-Line breaks are not included in the resulting list unless keepends\n\
-is given and true.");
-static PyObject*
-stringlib_splitlines(PyObject *self, PyObject *args)
-{
-    register Py_ssize_t i;
-    register Py_ssize_t j;
-    Py_ssize_t len;
-    int keepends = 0;
-    PyObject *list;
-    PyObject *str;
-    char *data;
-    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
-        return NULL;
-    data = STRINGLIB_STR(self);
-    len = STRINGLIB_LEN(self);
-    /* This does not use the preallocated list because splitlines is
-       usually run with hundreds of newlines.  The overhead of
-       switching between PyList_SET_ITEM and append causes about a
-       2-3% slowdown for that common case.  A smarter implementation
-       could move the if check out, so the SET_ITEMs are done first
-       and the appends only done when the prealloc buffer is full.
-       That's too much work for little gain.*/
-    list = PyList_New(0);
-    if (!list)
-        goto onError;
-    for (i = j = 0; i < len; ) {
-	Py_ssize_t eol;
-	/* Find a line and append it */
-	while (i < len && data[i] != '\n' && data[i] != '\r')
-	    i++;
-	/* Skip the line break reading CRLF as one line break */
-	eol = i;
-	if (i < len) {
-	    if (data[i] == '\r' && i + 1 < len &&
-		data[i+1] == '\n')
-		i += 2;
-	    else
-		i++;
-	    if (keepends)
-		eol = i;
-	}
-	_STRINGLIB_SPLIT_APPEND(data, j, eol);
-	j = i;
-    }
-    if (j < len) {
-	_STRINGLIB_SPLIT_APPEND(data, j, len);
-    }
-    return list;
- onError:
-    Py_XDECREF(list);
-    return NULL;
-}
-#undef _STRINGLIB_SPLIT_APPEND
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@@ -11,6 +11,8 @@
 #define STRINGLIB_TYPE_NAME      "unicode"
 #define STRINGLIB_PARSE_CODE     "U"
 #define STRINGLIB_EMPTY          unicode_empty
+#define STRINGLIB_ISSPACE        Py_UNICODE_ISSPACE
+#define STRINGLIB_ISLINEBREAK    BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL
 #define STRINGLIB_TODECIMAL      Py_UNICODE_TODECIMAL
 #define STRINGLIB_TOUPPER        Py_UNICODE_TOUPPER

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
--- a/PC/VS8.0/pythoncore.vcproj
+++ b/PC/VS8.0/pythoncore.vcproj
@@ -1490,6 +1490,10 @@
 				RelativePath="..\..\Objects\sliceobject.c"
 				>
 			</File>
+			<File
+				RelativePath="..\..\Objects\stringlib\split.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\Objects\structseq.c"
 				>

--- a/PCbuild/pythoncore.vcproj
+++ b/PCbuild/pythoncore.vcproj
@@ -1495,6 +1495,10 @@
 				RelativePath="..\Objects\sliceobject.c"
 				>
 			</File>
+			<File
+				RelativePath="..\Objects\stringlib\split.h"
+				>
+			</File>
 			<File
 				RelativePath="..\Objects\structseq.c"
 				>