Commit 26447c09 authored by Antoine Pitrou

Merged revisions 77461 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77461 | antoine.pitrou | 2010-01-13 08:55:48 +0100 (mer., 13 janv. 2010) | 5 lines

  Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
  methods of bytes, bytearray and unicode objects by using a common
  implementation based on stringlib's fast search.  Patch by Florent Xicluna.
........
parent 67a22e9d
...@@ -582,6 +582,7 @@ BYTESTR_DEPS = \ ...@@ -582,6 +582,7 @@ BYTESTR_DEPS = \
$(srcdir)/Objects/stringlib/fastsearch.h \ $(srcdir)/Objects/stringlib/fastsearch.h \
$(srcdir)/Objects/stringlib/find.h \ $(srcdir)/Objects/stringlib/find.h \
$(srcdir)/Objects/stringlib/partition.h \ $(srcdir)/Objects/stringlib/partition.h \
$(srcdir)/Objects/stringlib/split.h \
$(srcdir)/Objects/stringlib/stringdefs.h \ $(srcdir)/Objects/stringlib/stringdefs.h \
$(srcdir)/Objects/stringlib/string_format.h \ $(srcdir)/Objects/stringlib/string_format.h \
$(srcdir)/Objects/stringlib/transmogrify.h \ $(srcdir)/Objects/stringlib/transmogrify.h \
......
...@@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1? ...@@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
methods of bytes, bytearray and unicode objects by using a common
implementation based on stringlib's fast search. Patch by Florent Xicluna.
- Issue #7632: Fix a crash in dtoa.c that occurred in debug builds - Issue #7632: Fix a crash in dtoa.c that occurred in debug builds
when parsing certain long numeric strings corresponding to subnormal when parsing certain long numeric strings corresponding to subnormal
values. Also fix a number of bugs in dtoa.c that could lead to values. Also fix a number of bugs in dtoa.c that could lead to
......
...@@ -1039,14 +1039,16 @@ bytearray_dealloc(PyByteArrayObject *self) ...@@ -1039,14 +1039,16 @@ bytearray_dealloc(PyByteArrayObject *self)
#define STRINGLIB_STR PyByteArray_AS_STRING #define STRINGLIB_STR PyByteArray_AS_STRING
#define STRINGLIB_NEW PyByteArray_FromStringAndSize #define STRINGLIB_NEW PyByteArray_FromStringAndSize
#define STRINGLIB_EMPTY nullbytes #define STRINGLIB_EMPTY nullbytes
#define STRINGLIB_ISSPACE Py_ISSPACE
#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
#define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact #define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact
#define STRINGLIB_MUTABLE 1 #define STRINGLIB_MUTABLE 1
#define FROM_BYTEARRAY 1
#include "stringlib/fastsearch.h" #include "stringlib/fastsearch.h"
#include "stringlib/count.h" #include "stringlib/count.h"
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/partition.h" #include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h" #include "stringlib/ctype.h"
#include "stringlib/transmogrify.h" #include "stringlib/transmogrify.h"
...@@ -1054,21 +1056,20 @@ bytearray_dealloc(PyByteArrayObject *self) ...@@ -1054,21 +1056,20 @@ bytearray_dealloc(PyByteArrayObject *self)
/* The following Py_LOCAL_INLINE and Py_LOCAL functions /* The following Py_LOCAL_INLINE and Py_LOCAL functions
were copied from the old char* style string object. */ were copied from the old char* style string object. */
Py_LOCAL_INLINE(void) /* helper macro to fixup start/end slice values */
_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) #define ADJUST_INDICES(start, end, len) \
{ if (end > len) \
if (*end > len) end = len; \
*end = len; else if (end < 0) { \
else if (*end < 0) end += len; \
*end += len; if (end < 0) \
if (*end < 0) end = 0; \
*end = 0; } \
if (*start < 0) if (start < 0) { \
*start += len; start += len; \
if (*start < 0) if (start < 0) \
*start = 0; start = 0; \
} }
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir) bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
...@@ -1136,10 +1137,10 @@ bytearray_count(PyByteArrayObject *self, PyObject *args) ...@@ -1136,10 +1137,10 @@ bytearray_count(PyByteArrayObject *self, PyObject *args)
if (_getbuffer(sub_obj, &vsub) < 0) if (_getbuffer(sub_obj, &vsub) < 0)
return NULL; return NULL;
_adjust_indices(&start, &end, PyByteArray_GET_SIZE(self)); ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self));
count_obj = PyLong_FromSsize_t( count_obj = PyLong_FromSsize_t(
stringlib_count(str + start, end - start, vsub.buf, vsub.len) stringlib_count(str + start, end - start, vsub.buf, vsub.len, PY_SSIZE_T_MAX)
); );
PyBuffer_Release(&vsub); PyBuffer_Release(&vsub);
return count_obj; return count_obj;
...@@ -1247,7 +1248,7 @@ _bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start ...@@ -1247,7 +1248,7 @@ _bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start
if (_getbuffer(substr, &vsubstr) < 0) if (_getbuffer(substr, &vsubstr) < 0)
return -1; return -1;
_adjust_indices(&start, &end, len); ADJUST_INDICES(start, end, len);
if (direction < 0) { if (direction < 0) {
/* startswith */ /* startswith */
...@@ -1459,20 +1460,11 @@ bytearray_maketrans(PyObject *null, PyObject *args) ...@@ -1459,20 +1460,11 @@ bytearray_maketrans(PyObject *null, PyObject *args)
} }
#define FORWARD 1
#define REVERSE -1
/* find and count characters and substrings */ /* find and count characters and substrings */
#define findchar(target, target_len, c) \ #define findchar(target, target_len, c) \
((char *)memchr((const void *)(target), c, target_len)) ((char *)memchr((const void *)(target), c, target_len))
/* Don't call if length < 2 */
#define Py_STRING_MATCH(target, offset, pattern, length) \
(target[offset] == pattern[0] && \
target[offset+length-1] == pattern[length-1] && \
!memcmp(target+offset+1, pattern+1, length-2) )
/* Bytes ops must return a string, create a copy */ /* Bytes ops must return a string, create a copy */
Py_LOCAL(PyByteArrayObject *) Py_LOCAL(PyByteArrayObject *)
...@@ -1500,93 +1492,6 @@ countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount ...@@ -1500,93 +1492,6 @@ countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount
return count; return count;
} }
/* Search target[start:end] for the byte sequence `pattern`.

   target, target_len    buffer to search and its length
   pattern, pattern_len  bytes to look for and their length
   start, end            slice bounds; a negative value is taken relative to
                         target_len and then clamped to 0, and an `end`
                         beyond target_len is clamped to target_len
   direction             < 0 scans from the right, anything else scans from
                         the left

   Returns the offset of the match (relative to `target`), or -1 when the
   pattern does not occur in the slice.  An empty pattern matches at once:
   the result is `start` when direction > 0 and `end` otherwise.

   Patterns of length 1 are handled safely: the interior memcmp is only
   issued when there are interior bytes to compare, so its size argument can
   never be negative. */
Py_LOCAL(Py_ssize_t)
findstring(const char *target, Py_ssize_t target_len,
           const char *pattern, Py_ssize_t pattern_len,
           Py_ssize_t start,
           Py_ssize_t end,
           int direction)
{
    /* index of the last pattern byte; only used once pattern_len > 0 */
    const Py_ssize_t last = pattern_len - 1;

    if (start < 0) {
        start += target_len;
        if (start < 0)
            start = 0;
    }
    if (end > target_len) {
        end = target_len;
    } else if (end < 0) {
        end += target_len;
        if (end < 0)
            end = 0;
    }

    /* zero-length substrings always match at the first attempt */
    if (pattern_len == 0)
        return (direction > 0) ? start : end;

    /* from here on `end` is the last offset at which a match can begin */
    end -= pattern_len;

    if (direction < 0) {
        for (; end >= start; end--) {
            /* compare first and last byte before touching the interior */
            if (target[end] == pattern[0] &&
                target[end + last] == pattern[last] &&
                (pattern_len <= 2 ||
                 memcmp(target + end + 1, pattern + 1,
                        (size_t)(pattern_len - 2)) == 0))
                return end;
        }
    } else {
        for (; start <= end; start++) {
            if (target[start] == pattern[0] &&
                target[start + last] == pattern[last] &&
                (pattern_len <= 2 ||
                 memcmp(target + start + 1, pattern + 1,
                        (size_t)(pattern_len - 2)) == 0))
                return start;
        }
    }
    return -1;
}
/* Count non-overlapping occurrences of `pattern` in target[start:end].

   target, target_len    buffer to search and its length
   pattern, pattern_len  bytes to look for and their length
   start, end            slice bounds; a negative value is taken relative to
                         target_len and then clamped to 0, and an `end`
                         beyond target_len is clamped to target_len
   direction             < 0 scans from the right, anything else scans from
                         the left
   maxcount              counting stops as soon as this many matches were seen

   Returns the number of matches found.  When the pattern is empty or
   maxcount is 0 the result is the smaller of target_len + 1 and maxcount
   (note: this uses target_len, not the length of the slice).

   Patterns of length 1 are handled safely: the interior memcmp is only
   issued when there are interior bytes to compare, so its size argument can
   never be negative. */
Py_LOCAL_INLINE(Py_ssize_t)
countstring(const char *target, Py_ssize_t target_len,
            const char *pattern, Py_ssize_t pattern_len,
            Py_ssize_t start,
            Py_ssize_t end,
            int direction, Py_ssize_t maxcount)
{
    Py_ssize_t count = 0;
    /* index of the last pattern byte; only used once pattern_len > 0 */
    const Py_ssize_t last = pattern_len - 1;

    if (start < 0) {
        start += target_len;
        if (start < 0)
            start = 0;
    }
    if (end > target_len) {
        end = target_len;
    } else if (end < 0) {
        end += target_len;
        if (end < 0)
            end = 0;
    }

    /* zero-length substrings match everywhere */
    if (pattern_len == 0 || maxcount == 0) {
        if (target_len + 1 < maxcount)
            return target_len + 1;
        return maxcount;
    }

    /* from here on `end` is the last offset at which a match can begin */
    end -= pattern_len;

    if (direction < 0) {
        for (; end >= start; end--) {
            if (target[end] == pattern[0] &&
                target[end + last] == pattern[last] &&
                (pattern_len <= 2 ||
                 memcmp(target + end + 1, pattern + 1,
                        (size_t)(pattern_len - 2)) == 0)) {
                count++;
                if (--maxcount <= 0)
                    break;
                /* together with the loop's end-- this steps a whole
                   pattern length to the left, so matches never overlap */
                end -= pattern_len - 1;
            }
        }
    } else {
        for (; start <= end; start++) {
            if (target[start] == pattern[0] &&
                target[start + last] == pattern[last] &&
                (pattern_len <= 2 ||
                 memcmp(target + start + 1, pattern + 1,
                        (size_t)(pattern_len - 2)) == 0)) {
                count++;
                if (--maxcount <= 0)
                    break;
                /* together with the loop's start++ this skips past the
                   match, so matches never overlap */
                start += pattern_len - 1;
            }
        }
    }
    return count;
}
/* Algorithms for different cases of string replacement */ /* Algorithms for different cases of string replacement */
...@@ -1708,9 +1613,8 @@ replace_delete_substring(PyByteArrayObject *self, ...@@ -1708,9 +1613,8 @@ replace_delete_substring(PyByteArrayObject *self,
self_len = PyByteArray_GET_SIZE(self); self_len = PyByteArray_GET_SIZE(self);
self_s = PyByteArray_AS_STRING(self); self_s = PyByteArray_AS_STRING(self);
count = countstring(self_s, self_len, count = stringlib_count(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, 1,
maxcount); maxcount);
if (count == 0) { if (count == 0) {
...@@ -1730,9 +1634,9 @@ replace_delete_substring(PyByteArrayObject *self, ...@@ -1730,9 +1634,9 @@ replace_delete_substring(PyByteArrayObject *self,
start = self_s; start = self_s;
end = self_s + self_len; end = self_s + self_len;
while (count-- > 0) { while (count-- > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset == -1) if (offset == -1)
break; break;
next = start + offset; next = start + offset;
...@@ -1808,9 +1712,9 @@ replace_substring_in_place(PyByteArrayObject *self, ...@@ -1808,9 +1712,9 @@ replace_substring_in_place(PyByteArrayObject *self,
self_s = PyByteArray_AS_STRING(self); self_s = PyByteArray_AS_STRING(self);
self_len = PyByteArray_GET_SIZE(self); self_len = PyByteArray_GET_SIZE(self);
offset = findstring(self_s, self_len, offset = stringlib_find(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, FORWARD); 0);
if (offset == -1) { if (offset == -1) {
/* No matches; return the original bytes */ /* No matches; return the original bytes */
return return_self(self); return return_self(self);
...@@ -1830,9 +1734,9 @@ replace_substring_in_place(PyByteArrayObject *self, ...@@ -1830,9 +1734,9 @@ replace_substring_in_place(PyByteArrayObject *self,
end = result_s + self_len; end = result_s + self_len;
while ( --maxcount > 0) { while ( --maxcount > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset==-1) if (offset==-1)
break; break;
Py_MEMCPY(start+offset, to_s, from_len); Py_MEMCPY(start+offset, to_s, from_len);
...@@ -1925,9 +1829,10 @@ replace_substring(PyByteArrayObject *self, ...@@ -1925,9 +1829,10 @@ replace_substring(PyByteArrayObject *self,
self_s = PyByteArray_AS_STRING(self); self_s = PyByteArray_AS_STRING(self);
self_len = PyByteArray_GET_SIZE(self); self_len = PyByteArray_GET_SIZE(self);
count = countstring(self_s, self_len, count = stringlib_count(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, FORWARD, maxcount); maxcount);
if (count == 0) { if (count == 0) {
/* no matches, return unchanged */ /* no matches, return unchanged */
return return_self(self); return return_self(self);
...@@ -1954,9 +1859,9 @@ replace_substring(PyByteArrayObject *self, ...@@ -1954,9 +1859,9 @@ replace_substring(PyByteArrayObject *self,
start = self_s; start = self_s;
end = self_s + self_len; end = self_s + self_len;
while (count-- > 0) { while (count-- > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset == -1) if (offset == -1)
break; break;
next = start+offset; next = start+offset;
...@@ -2085,123 +1990,6 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args) ...@@ -2085,123 +1990,6 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args)
return res; return res;
} }
/* Overallocate the initial list to reduce the number of reallocs for small
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
text (roughly 11 words per line) and field delimited data (usually 1-10
fields). For large strings the split algorithms are bandwidth limited
so increasing the preallocation likely will not improve things.*/
#define MAX_PREALLOC 12
/* 5 splits gives 6 elements */
#define PREALLOC_SIZE(maxsplit) \
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
#define SPLIT_APPEND(data, left, right) \
str = PyByteArray_FromStringAndSize((data) + (left), \
(right) - (left)); \
if (str == NULL) \
goto onError; \
if (PyList_Append(list, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str);
#define SPLIT_ADD(data, left, right) { \
str = PyByteArray_FromStringAndSize((data) + (left), \
(right) - (left)); \
if (str == NULL) \
goto onError; \
if (count < MAX_PREALLOC) { \
PyList_SET_ITEM(list, count, str); \
} else { \
if (PyList_Append(list, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str); \
} \
count++; }
/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
/* Split the buffer s[0:len] at occurrences of the single byte `ch`, scanning
   left to right and cutting at most `maxcount` times.

   Returns a new list of the pieces (each built by SPLIT_ADD, i.e. with
   PyByteArray_FromStringAndSize), or NULL if the list cannot be allocated
   or SPLIT_ADD jumps to onError. */
Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
/* i: start of the piece being collected, j: scan position,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
register Py_ssize_t i, j, count = 0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxcount) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
i = j = 0;
/* each outer iteration uses up one allowed split */
while ((j < len) && (maxcount-- > 0)) {
for(; j < len; j++) {
/* I found that using memchr makes no difference */
if (s[j] == ch) {
SPLIT_ADD(s, i, j);
/* continue right after the separator byte */
i = j = j + 1;
break;
}
}
}
/* i never exceeds len here, so the remainder s[i:len] (possibly empty)
   is always stored as the last piece */
if (i <= len) {
SPLIT_ADD(s, i, len);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
return list;
/* reached through the gotos inside SPLIT_ADD */
onError:
Py_DECREF(list);
return NULL;
}
/* Split the buffer s[0:len] into tokens separated by runs of bytes for which
   Py_ISSPACE is true, storing at most `maxcount` tokens before the rest of
   the buffer is kept as one final piece.

   Returns a new list of the pieces (each built by SPLIT_ADD), or NULL if
   the list cannot be allocated or SPLIT_ADD jumps to onError. */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
/* i: scan position, j: start of the current token,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
register Py_ssize_t i, j, count = 0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxcount) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
for (i = j = 0; i < len; ) {
/* find a token */
while (i < len && Py_ISSPACE(s[i]))
i++;
j = i;
while (i < len && !Py_ISSPACE(s[i]))
i++;
/* j < i means a non-empty token s[j:i] was found */
if (j < i) {
/* split budget used up: leave j at the token start so the tail
   handling below stores s[j:len] in one piece */
if (maxcount-- <= 0)
break;
SPLIT_ADD(s, j, i);
/* skip the whitespace that follows the token */
while (i < len && Py_ISSPACE(s[i]))
i++;
j = i;
}
}
/* only true after the break above; the remainder is stored as is,
   without removing whitespace at its end */
if (j < len) {
SPLIT_ADD(s, j, len);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
return list;
/* reached through the gotos inside SPLIT_ADD */
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(split__doc__, PyDoc_STRVAR(split__doc__,
"B.split([sep[, maxsplit]]) -> list of bytearrays\n\ "B.split([sep[, maxsplit]]) -> list of bytearrays\n\
\n\ \n\
...@@ -2213,10 +2001,10 @@ If maxsplit is given, at most maxsplit splits are done."); ...@@ -2213,10 +2001,10 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytearray_split(PyByteArrayObject *self, PyObject *args) bytearray_split(PyByteArrayObject *self, PyObject *args)
{ {
Py_ssize_t len = PyByteArray_GET_SIZE(self), n, i, j, pos; Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
Py_ssize_t maxsplit = -1, count = 0; Py_ssize_t maxsplit = -1;
const char *s = PyByteArray_AS_STRING(self), *sub; const char *s = PyByteArray_AS_STRING(self), *sub;
PyObject *list, *str, *subobj = Py_None; PyObject *list, *subobj = Py_None;
Py_buffer vsub; Py_buffer vsub;
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
...@@ -2225,73 +2013,18 @@ bytearray_split(PyByteArrayObject *self, PyObject *args) ...@@ -2225,73 +2013,18 @@ bytearray_split(PyByteArrayObject *self, PyObject *args)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None) if (subobj == Py_None)
return split_whitespace(s, len, maxsplit); return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0) if (_getbuffer(subobj, &vsub) < 0)
return NULL; return NULL;
sub = vsub.buf; sub = vsub.buf;
n = vsub.len; n = vsub.len;
if (n == 0) { list = stringlib_split(
PyErr_SetString(PyExc_ValueError, "empty separator"); (PyObject*) self, s, len, sub, n, maxsplit
PyBuffer_Release(&vsub); );
return NULL;
}
if (n == 1) {
list = split_char(s, len, sub[0], maxsplit);
PyBuffer_Release(&vsub);
return list;
}
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) {
PyBuffer_Release(&vsub);
return NULL;
}
i = j = 0;
while (maxsplit-- > 0) {
pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
if (pos < 0)
break;
j = i+pos;
SPLIT_ADD(s, i, j);
i = j + n;
}
SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list);
PyBuffer_Release(&vsub); PyBuffer_Release(&vsub);
return list; return list;
onError:
Py_DECREF(list);
PyBuffer_Release(&vsub);
return NULL;
}
/* stringlib's partition shares nullbytes in some cases.
undo this, we don't want the nullbytes to be shared. */
static PyObject *
make_nullbytes_unique(PyObject *result)
{
if (result != NULL) {
int i;
assert(PyTuple_Check(result));
assert(PyTuple_GET_SIZE(result) == 3);
for (i = 0; i < 3; i++) {
if (PyTuple_GET_ITEM(result, i) == (PyObject *)nullbytes) {
PyObject *new = PyByteArray_FromStringAndSize(NULL, 0);
if (new == NULL) {
Py_DECREF(result);
result = NULL;
break;
}
Py_DECREF(nullbytes);
PyTuple_SET_ITEM(result, i, new);
}
}
}
return result;
} }
PyDoc_STRVAR(partition__doc__, PyDoc_STRVAR(partition__doc__,
...@@ -2318,7 +2051,7 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep_obj) ...@@ -2318,7 +2051,7 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep_obj)
); );
Py_DECREF(bytesep); Py_DECREF(bytesep);
return make_nullbytes_unique(result); return result;
} }
PyDoc_STRVAR(rpartition__doc__, PyDoc_STRVAR(rpartition__doc__,
...@@ -2346,81 +2079,7 @@ bytearray_rpartition(PyByteArrayObject *self, PyObject *sep_obj) ...@@ -2346,81 +2079,7 @@ bytearray_rpartition(PyByteArrayObject *self, PyObject *sep_obj)
); );
Py_DECREF(bytesep); Py_DECREF(bytesep);
return make_nullbytes_unique(result); return result;
}
/* Split the buffer s[0:len] at occurrences of the single byte `ch`, scanning
   right to left and cutting at most `maxcount` times.

   Pieces are collected from the right and the list is reversed before it is
   returned.  Returns NULL if the list cannot be allocated, SPLIT_ADD jumps
   to onError, or PyList_Reverse fails. */
Py_LOCAL_INLINE(PyObject *)
rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
/* i: scan position, j: index of the last byte of the piece being collected,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
register Py_ssize_t i, j, count=0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxcount) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
i = j = len - 1;
/* each outer iteration uses up one allowed split */
while ((i >= 0) && (maxcount-- > 0)) {
for (; i >= 0; i--) {
if (s[i] == ch) {
/* piece to the right of the separator */
SPLIT_ADD(s, i + 1, j + 1);
/* continue just left of the separator byte */
j = i = i - 1;
break;
}
}
}
/* j is never below -1 here, so the leading piece s[0:j+1] (possibly empty)
   is always stored */
if (j >= -1) {
SPLIT_ADD(s, 0, j + 1);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
/* pieces were gathered right to left; put them in buffer order */
if (PyList_Reverse(list) < 0)
goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
}
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
for (i = j = len - 1; i >= 0; ) {
/* find a token */
while (i >= 0 && Py_ISSPACE(s[i]))
i--;
j = i;
while (i >= 0 && !Py_ISSPACE(s[i]))
i--;
if (j > i) {
if (maxcount-- <= 0)
break;
SPLIT_ADD(s, i + 1, j + 1);
while (i >= 0 && Py_ISSPACE(s[i]))
i--;
j = i;
}
}
if (j >= 0) {
SPLIT_ADD(s, 0, j + 1);
}
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
} }
PyDoc_STRVAR(rsplit__doc__, PyDoc_STRVAR(rsplit__doc__,
...@@ -2435,10 +2094,10 @@ If maxsplit is given, at most maxsplit splits are done."); ...@@ -2435,10 +2094,10 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytearray_rsplit(PyByteArrayObject *self, PyObject *args) bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
{ {
Py_ssize_t len = PyByteArray_GET_SIZE(self), n, j, pos; Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
Py_ssize_t maxsplit = -1, count = 0; Py_ssize_t maxsplit = -1;
const char *s = PyByteArray_AS_STRING(self), *sub; const char *s = PyByteArray_AS_STRING(self), *sub;
PyObject *list, *str, *subobj = Py_None; PyObject *list, *subobj = Py_None;
Py_buffer vsub; Py_buffer vsub;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
...@@ -2447,50 +2106,18 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args) ...@@ -2447,50 +2106,18 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None) if (subobj == Py_None)
return rsplit_whitespace(s, len, maxsplit); return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0) if (_getbuffer(subobj, &vsub) < 0)
return NULL; return NULL;
sub = vsub.buf; sub = vsub.buf;
n = vsub.len; n = vsub.len;
if (n == 0) { list = stringlib_rsplit(
PyErr_SetString(PyExc_ValueError, "empty separator"); (PyObject*) self, s, len, sub, n, maxsplit
PyBuffer_Release(&vsub); );
return NULL;
}
else if (n == 1) {
list = rsplit_char(s, len, sub[0], maxsplit);
PyBuffer_Release(&vsub);
return list;
}
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) {
PyBuffer_Release(&vsub);
return NULL;
}
j = len;
while (maxsplit-- > 0) {
pos = fastsearch(s, j, sub, n, FAST_RSEARCH);
if (pos < 0)
break;
SPLIT_ADD(s, pos + n, j);
j = pos;
}
SPLIT_ADD(s, 0, j);
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
PyBuffer_Release(&vsub); PyBuffer_Release(&vsub);
return list; return list;
onError:
Py_DECREF(list);
PyBuffer_Release(&vsub);
return NULL;
} }
PyDoc_STRVAR(reverse__doc__, PyDoc_STRVAR(reverse__doc__,
...@@ -2956,6 +2583,27 @@ bytearray_join(PyByteArrayObject *self, PyObject *it) ...@@ -2956,6 +2583,27 @@ bytearray_join(PyByteArrayObject *self, PyObject *it)
return NULL; return NULL;
} }
PyDoc_STRVAR(splitlines__doc__,
"B.splitlines([keepends]) -> list of lines\n\
\n\
Return a list of the lines in B, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.");
/* Method implementation behind B.splitlines([keepends]).

   Parses the optional integer `keepends` argument (default 0) and hands the
   bytearray's buffer and size to stringlib_splitlines.  Returns whatever
   stringlib_splitlines returns, or NULL when argument parsing fails. */
static PyObject*
bytearray_splitlines(PyObject *self, PyObject *args)
{
    int keep_linebreaks = 0;
    const char *data;
    Py_ssize_t size;

    if (PyArg_ParseTuple(args, "|i:splitlines", &keep_linebreaks) == 0) {
        return NULL;
    }

    data = PyByteArray_AS_STRING(self);
    size = PyByteArray_GET_SIZE(self);
    return stringlib_splitlines(self, data, size, keep_linebreaks);
}
PyDoc_STRVAR(fromhex_doc, PyDoc_STRVAR(fromhex_doc,
"bytearray.fromhex(string) -> bytearray (static method)\n\ "bytearray.fromhex(string) -> bytearray (static method)\n\
\n\ \n\
...@@ -3134,7 +2782,7 @@ bytearray_methods[] = { ...@@ -3134,7 +2782,7 @@ bytearray_methods[] = {
{"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__}, {"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__},
{"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__}, {"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__},
{"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__}, {"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__},
{"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS, {"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS,
splitlines__doc__}, splitlines__doc__},
{"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS , {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS ,
startswith__doc__}, startswith__doc__},
......
...@@ -56,7 +56,7 @@ static PyBytesObject *nullstring; ...@@ -56,7 +56,7 @@ static PyBytesObject *nullstring;
If `str' is NULL then PyBytes_FromStringAndSize() will allocate `size+1' If `str' is NULL then PyBytes_FromStringAndSize() will allocate `size+1'
bytes (setting the last byte to the null terminating character) and you can bytes (setting the last byte to the null terminating character) and you can
fill in the data yourself. If `str' is non-NULL then the resulting fill in the data yourself. If `str' is non-NULL then the resulting
PyString object must be treated as immutable and you must not fill in nor PyBytes object must be treated as immutable and you must not fill in nor
alter the data yourself, since the strings may be shared. alter the data yourself, since the strings may be shared.
The PyObject member `op->ob_size', which denotes the number of "extra The PyObject member `op->ob_size', which denotes the number of "extra
...@@ -568,9 +568,9 @@ PyBytes_AsStringAndSize(register PyObject *obj, ...@@ -568,9 +568,9 @@ PyBytes_AsStringAndSize(register PyObject *obj,
#include "stringlib/count.h" #include "stringlib/count.h"
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/partition.h" #include "stringlib/partition.h"
#include "stringlib/split.h"
#include "stringlib/ctype.h" #include "stringlib/ctype.h"
#define STRINGLIB_MUTABLE 0
#include "stringlib/transmogrify.h" #include "stringlib/transmogrify.h"
PyObject * PyObject *
...@@ -1000,133 +1000,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; ...@@ -1000,133 +1000,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3) #define STRIPNAME(i) (stripformat[i]+3)
/* Don't call if length < 2 */
#define Py_STRING_MATCH(target, offset, pattern, length) \
(target[offset] == pattern[0] && \
target[offset+length-1] == pattern[length-1] && \
!memcmp(target+offset+1, pattern+1, length-2) )
/* Overallocate the initial list to reduce the number of reallocs for small
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
text (roughly 11 words per line) and field delimited data (usually 1-10
fields). For large strings the split algorithms are bandwidth limited
so increasing the preallocation likely will not improve things.*/
#define MAX_PREALLOC 12
/* 5 splits gives 6 elements */
#define PREALLOC_SIZE(maxsplit) \
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
#define SPLIT_ADD(data, left, right) { \
str = PyBytes_FromStringAndSize((data) + (left), \
(right) - (left)); \
if (str == NULL) \
goto onError; \
if (count < MAX_PREALLOC) { \
PyList_SET_ITEM(list, count, str); \
} else { \
if (PyList_Append(list, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str); \
} \
count++; }
/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
#define SKIP_SPACE(s, i, len) { while (i<len && ISSPACE(s[i])) i++; }
#define SKIP_NONSPACE(s, i, len) { while (i<len && !ISSPACE(s[i])) i++; }
#define RSKIP_SPACE(s, i) { while (i>=0 && ISSPACE(s[i])) i--; }
#define RSKIP_NONSPACE(s, i) { while (i>=0 && !ISSPACE(s[i])) i--; }
/* Split the bytes object `self` (of length `len`) into tokens separated by
   runs of whitespace as defined by the SKIP_SPACE / SKIP_NONSPACE macros,
   storing at most `maxsplit` tokens before the rest is kept as one piece.

   Returns a new list of the pieces (each built by SPLIT_ADD, i.e. with
   PyBytes_FromStringAndSize, except for the shortcut below that reuses
   `self`), or NULL if the list cannot be allocated or SPLIT_ADD jumps to
   onError. */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
const char *s = PyBytes_AS_STRING(self);
/* i: scan position, j: start of the current token,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
Py_ssize_t i, j, count=0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxsplit) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
i = j = 0;
/* each iteration uses up one allowed split */
while (maxsplit-- > 0) {
SKIP_SPACE(s, i, len);
/* nothing but whitespace left */
if (i==len) break;
/* s[i] is known to be non-space, so start scanning at the next byte */
j = i; i++;
SKIP_NONSPACE(s, i, len);
if (j == 0 && i == len && PyBytes_CheckExact(self)) {
/* No whitespace in self, so just use it as list[0] */
Py_INCREF(self);
PyList_SET_ITEM(list, 0, (PyObject *)self);
count++;
break;
}
SPLIT_ADD(s, j, i);
}
if (i < len) {
/* Only occurs when maxsplit was reached */
/* Skip any remaining whitespace and copy to end of string */
SKIP_SPACE(s, i, len);
if (i != len)
SPLIT_ADD(s, i, len);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
return list;
/* reached through the gotos inside SPLIT_ADD */
onError:
Py_DECREF(list);
return NULL;
}
/* Split the bytes object `self` (of length `len`) at occurrences of the
   single byte `ch`, scanning left to right and cutting at most `maxcount`
   times.

   Returns a new list of the pieces (each built by SPLIT_ADD, except for the
   shortcut below that reuses `self`), or NULL if the list cannot be
   allocated or SPLIT_ADD jumps to onError. */
Py_LOCAL_INLINE(PyObject *)
split_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
const char *s = PyBytes_AS_STRING(self);
/* i: start of the piece being collected, j: scan position,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
register Py_ssize_t i, j, count=0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxcount) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
i = j = 0;
/* each outer iteration uses up one allowed split */
while ((j < len) && (maxcount-- > 0)) {
for(; j<len; j++) {
/* I found that using memchr makes no difference */
if (s[j] == ch) {
SPLIT_ADD(s, i, j);
/* continue right after the separator byte */
i = j = j + 1;
break;
}
}
}
if (i == 0 && count == 0 && PyBytes_CheckExact(self)) {
/* ch not in self, so just use self as list[0] */
Py_INCREF(self);
PyList_SET_ITEM(list, 0, (PyObject *)self);
count++;
}
/* otherwise store the remainder s[i:len], which may be empty */
else if (i <= len) {
SPLIT_ADD(s, i, len);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
return list;
/* reached through the gotos inside SPLIT_ADD */
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(split__doc__, PyDoc_STRVAR(split__doc__,
"B.split([sep[, maxsplit]]) -> list of bytes\n\ "B.split([sep[, maxsplit]]) -> list of bytes\n\
\n\ \n\
...@@ -1138,74 +1011,26 @@ If maxsplit is given, at most maxsplit splits are done."); ...@@ -1138,74 +1011,26 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytes_split(PyBytesObject *self, PyObject *args) bytes_split(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; Py_ssize_t len = PyBytes_GET_SIZE(self), n;
Py_ssize_t maxsplit = -1, count=0; Py_ssize_t maxsplit = -1;
const char *s = PyBytes_AS_STRING(self), *sub; const char *s = PyBytes_AS_STRING(self), *sub;
Py_buffer vsub; Py_buffer vsub;
PyObject *list, *str, *subobj = Py_None; PyObject *list, *subobj = Py_None;
#ifdef USE_FAST
Py_ssize_t pos;
#endif
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL; return NULL;
if (maxsplit < 0) if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None) if (subobj == Py_None)
return split_whitespace(self, len, maxsplit); return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0) if (_getbuffer(subobj, &vsub) < 0)
return NULL; return NULL;
sub = vsub.buf; sub = vsub.buf;
n = vsub.len; n = vsub.len;
if (n == 0) { list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
PyErr_SetString(PyExc_ValueError, "empty separator");
PyBuffer_Release(&vsub);
return NULL;
}
else if (n == 1) {
list = split_char(self, len, sub[0], maxsplit);
PyBuffer_Release(&vsub); PyBuffer_Release(&vsub);
return list; return list;
}
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) {
PyBuffer_Release(&vsub);
return NULL;
}
#ifdef USE_FAST
i = j = 0;
while (maxsplit-- > 0) {
pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
if (pos < 0)
break;
j = i+pos;
SPLIT_ADD(s, i, j);
i = j + n;
}
#else
i = j = 0;
while ((j+n <= len) && (maxsplit-- > 0)) {
for (; j+n <= len; j++) {
if (Py_STRING_MATCH(s, j, sub, n)) {
SPLIT_ADD(s, i, j);
i = j = j + n;
break;
}
}
}
#endif
SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list);
PyBuffer_Release(&vsub);
return list;
onError:
Py_DECREF(list);
PyBuffer_Release(&vsub);
return NULL;
} }
PyDoc_STRVAR(partition__doc__, PyDoc_STRVAR(partition__doc__,
...@@ -1263,90 +1088,6 @@ bytes_rpartition(PyBytesObject *self, PyObject *sep_obj) ...@@ -1263,90 +1088,6 @@ bytes_rpartition(PyBytesObject *self, PyObject *sep_obj)
); );
} }
/* Split the bytes object `self` (of length `len`) into tokens separated by
   runs of whitespace as defined by the RSKIP_SPACE / RSKIP_NONSPACE macros,
   scanning right to left and storing at most `maxsplit` tokens before the
   rest is kept as one piece.

   Pieces are collected from the right and the list is reversed before it is
   returned.  Returns NULL if the list cannot be allocated, SPLIT_ADD jumps
   to onError, or PyList_Reverse fails. */
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
const char *s = PyBytes_AS_STRING(self);
/* i: scan position, j: index of the last byte of the current token,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
Py_ssize_t i, j, count=0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxsplit) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
i = j = len-1;
/* each iteration uses up one allowed split */
while (maxsplit-- > 0) {
RSKIP_SPACE(s, i);
/* nothing but whitespace left */
if (i<0) break;
/* s[i] is known to be non-space, so start scanning at the previous byte */
j = i; i--;
RSKIP_NONSPACE(s, i);
if (j == len-1 && i < 0 && PyBytes_CheckExact(self)) {
/* No whitespace in self, so just use it as list[0] */
Py_INCREF(self);
PyList_SET_ITEM(list, 0, (PyObject *)self);
count++;
break;
}
SPLIT_ADD(s, i + 1, j + 1);
}
if (i >= 0) {
/* Only occurs when maxsplit was reached. Skip any remaining
whitespace and copy to beginning of string. */
RSKIP_SPACE(s, i);
if (i >= 0)
SPLIT_ADD(s, 0, i + 1);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
/* pieces were gathered right to left; put them in buffer order */
if (PyList_Reverse(list) < 0)
goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
}
/* Split the bytes object `self` (of length `len`) at occurrences of the
   single byte `ch`, scanning right to left and cutting at most `maxcount`
   times.

   Pieces are collected from the right and the list is reversed before it is
   returned.  Returns NULL if the list cannot be allocated, SPLIT_ADD jumps
   to onError, or PyList_Reverse fails. */
Py_LOCAL_INLINE(PyObject *)
rsplit_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
const char *s = PyBytes_AS_STRING(self);
/* i: scan position, j: index of the last byte of the piece being collected,
   count: number of pieces stored so far (advanced inside SPLIT_ADD) */
register Py_ssize_t i, j, count=0;
/* scratch object used by SPLIT_ADD */
PyObject *str;
/* list starts with PREALLOC_SIZE(maxcount) slots; trimmed to `count` below */
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
i = j = len - 1;
/* each outer iteration uses up one allowed split */
while ((i >= 0) && (maxcount-- > 0)) {
for (; i >= 0; i--) {
if (s[i] == ch) {
/* piece to the right of the separator */
SPLIT_ADD(s, i + 1, j + 1);
/* continue just left of the separator byte */
j = i = i - 1;
break;
}
}
}
if (i < 0 && count == 0 && PyBytes_CheckExact(self)) {
/* ch not in self, so just use self as list[0] */
Py_INCREF(self);
PyList_SET_ITEM(list, 0, (PyObject *)self);
count++;
}
/* otherwise store the leading piece s[0:j+1], which may be empty */
else if (j >= -1) {
SPLIT_ADD(s, 0, j + 1);
}
/* set the list size to the number of pieces actually stored */
FIX_PREALLOC_SIZE(list);
/* pieces were gathered right to left; put them in buffer order */
if (PyList_Reverse(list) < 0)
goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(rsplit__doc__, PyDoc_STRVAR(rsplit__doc__,
"B.rsplit([sep[, maxsplit]]) -> list of bytes\n\ "B.rsplit([sep[, maxsplit]]) -> list of bytes\n\
\n\ \n\
...@@ -1360,71 +1101,28 @@ If maxsplit is given, at most maxsplit splits are done."); ...@@ -1360,71 +1101,28 @@ If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytes_rsplit(PyBytesObject *self, PyObject *args) bytes_rsplit(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; Py_ssize_t len = PyBytes_GET_SIZE(self), n;
Py_ssize_t maxsplit = -1, count=0; Py_ssize_t maxsplit = -1;
const char *s, *sub; const char *s = PyBytes_AS_STRING(self), *sub;
Py_buffer vsub; Py_buffer vsub;
PyObject *list, *str, *subobj = Py_None; PyObject *list, *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL; return NULL;
if (maxsplit < 0) if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (subobj == Py_None) if (subobj == Py_None)
return rsplit_whitespace(self, len, maxsplit); return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0) if (_getbuffer(subobj, &vsub) < 0)
return NULL; return NULL;
sub = vsub.buf; sub = vsub.buf;
n = vsub.len; n = vsub.len;
if (n == 0) { list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
PyErr_SetString(PyExc_ValueError, "empty separator");
PyBuffer_Release(&vsub);
return NULL;
}
else if (n == 1) {
list = rsplit_char(self, len, sub[0], maxsplit);
PyBuffer_Release(&vsub); PyBuffer_Release(&vsub);
return list; return list;
}
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) {
PyBuffer_Release(&vsub);
return NULL;
}
j = len;
i = j - n;
s = PyBytes_AS_STRING(self);
while ( (i >= 0) && (maxsplit-- > 0) ) {
for (; i>=0; i--) {
if (Py_STRING_MATCH(s, i, sub, n)) {
SPLIT_ADD(s, i + n, j);
j = i;
i -= n;
break;
}
}
}
SPLIT_ADD(s, 0, j);
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
PyBuffer_Release(&vsub);
return list;
onError:
Py_DECREF(list);
PyBuffer_Release(&vsub);
return NULL;
} }
#undef SPLIT_ADD
#undef MAX_PREALLOC
#undef PREALLOC_SIZE
PyDoc_STRVAR(join__doc__, PyDoc_STRVAR(join__doc__,
"B.join(iterable_of_bytes) -> bytes\n\ "B.join(iterable_of_bytes) -> bytes\n\
...@@ -1531,20 +1229,20 @@ _PyBytes_Join(PyObject *sep, PyObject *x) ...@@ -1531,20 +1229,20 @@ _PyBytes_Join(PyObject *sep, PyObject *x)
return bytes_join(sep, x); return bytes_join(sep, x);
} }
Py_LOCAL_INLINE(void) /* helper macro to fixup start/end slice values */
bytes_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) #define ADJUST_INDICES(start, end, len) \
{ if (end > len) \
if (*end > len) end = len; \
*end = len; else if (end < 0) { \
else if (*end < 0) end += len; \
*end += len; if (end < 0) \
if (*end < 0) end = 0; \
*end = 0; } \
if (*start < 0) if (start < 0) { \
*start += len; start += len; \
if (*start < 0) if (start < 0) \
*start = 0; start = 0; \
} }
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
...@@ -1591,7 +1289,7 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) ...@@ -1591,7 +1289,7 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
PyDoc_STRVAR(find__doc__, PyDoc_STRVAR(find__doc__,
"B.find(sub[, start[, end]]) -> int\n\ "B.find(sub[, start[, end]]) -> int\n\
\n\ \n\
Return the lowest index in S where substring sub is found,\n\ Return the lowest index in B where substring sub is found,\n\
such that sub is contained within s[start:end]. Optional\n\ such that sub is contained within s[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\ arguments start and end are interpreted as in slice notation.\n\
\n\ \n\
...@@ -1801,7 +1499,7 @@ PyDoc_STRVAR(count__doc__, ...@@ -1801,7 +1499,7 @@ PyDoc_STRVAR(count__doc__,
"B.count(sub[, start[, end]]) -> int\n\ "B.count(sub[, start[, end]]) -> int\n\
\n\ \n\
Return the number of non-overlapping occurrences of substring sub in\n\ Return the number of non-overlapping occurrences of substring sub in\n\
string S[start:end]. Optional arguments start and end are interpreted\n\ string B[start:end]. Optional arguments start and end are interpreted\n\
as in slice notation."); as in slice notation.");
static PyObject * static PyObject *
...@@ -1823,10 +1521,10 @@ bytes_count(PyBytesObject *self, PyObject *args) ...@@ -1823,10 +1521,10 @@ bytes_count(PyBytesObject *self, PyObject *args)
else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len)) else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
return NULL; return NULL;
bytes_adjust_indices(&start, &end, PyBytes_GET_SIZE(self)); ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self));
return PyLong_FromSsize_t( return PyLong_FromSsize_t(
stringlib_count(str + start, end - start, sub, sub_len) stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
); );
} }
...@@ -1943,9 +1641,6 @@ bytes_maketrans(PyObject *null, PyObject *args) ...@@ -1943,9 +1641,6 @@ bytes_maketrans(PyObject *null, PyObject *args)
return _Py_bytes_maketrans(args); return _Py_bytes_maketrans(args);
} }
#define FORWARD 1
#define REVERSE -1
/* find and count characters and substrings */ /* find and count characters and substrings */
#define findchar(target, target_len, c) \ #define findchar(target, target_len, c) \
...@@ -1981,94 +1676,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount) ...@@ -1981,94 +1676,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
return count; return count;
} }
/* Locate pattern inside target[start:end].

   start and end are first normalised the way slice bounds are: negative
   values are offset by target_len and floored at 0, and end is capped at
   target_len.  A positive direction searches forwards and returns the
   lowest matching index; otherwise the search runs backwards and returns
   the highest one.  Returns -1 when there is no match.  An empty pattern
   matches immediately, at start (forward) or at end (backward). */
Py_LOCAL(Py_ssize_t)
findstring(const char *target, Py_ssize_t target_len,
           const char *pattern, Py_ssize_t pattern_len,
           Py_ssize_t start,
           Py_ssize_t end,
           int direction)
{
    Py_ssize_t pos, last;

    if (start < 0) {
        start += target_len;
        if (start < 0)
            start = 0;
    }
    if (end > target_len)
        end = target_len;
    else if (end < 0) {
        end += target_len;
        if (end < 0)
            end = 0;
    }

    if (pattern_len == 0) {
        if (direction > 0)
            return start;
        return end;
    }

    /* last index at which the pattern still fits before end */
    last = end - pattern_len;

    if (direction < 0) {
        for (pos = last; pos >= start; pos--) {
            if (Py_STRING_MATCH(target, pos, pattern, pattern_len))
                return pos;
        }
    }
    else {
        for (pos = start; pos <= last; pos++) {
            if (Py_STRING_MATCH(target, pos, pattern, pattern_len))
                return pos;
        }
    }
    return -1;
}
/* Count non-overlapping occurrences of pattern in target[start:end],
   stopping once maxcount matches have been seen.

   start and end are normalised like slice bounds (negative values are
   offset by target_len and floored at 0; end is capped at target_len).
   A negative direction scans from the right, anything else from the left.
   When the pattern is empty or maxcount is 0 the result is the smaller of
   target_len + 1 and maxcount. */
Py_LOCAL_INLINE(Py_ssize_t)
countstring(const char *target, Py_ssize_t target_len,
            const char *pattern, Py_ssize_t pattern_len,
            Py_ssize_t start,
            Py_ssize_t end,
            int direction, Py_ssize_t maxcount)
{
    Py_ssize_t found = 0;
    Py_ssize_t pos, last;

    if (start < 0) {
        start += target_len;
        if (start < 0)
            start = 0;
    }
    if (end > target_len)
        end = target_len;
    else if (end < 0) {
        end += target_len;
        if (end < 0)
            end = 0;
    }

    /* zero-length substrings match everywhere */
    if (pattern_len == 0 || maxcount == 0)
        return (target_len + 1 < maxcount) ? target_len + 1 : maxcount;

    /* last index at which the pattern still fits before end */
    last = end - pattern_len;

    if (direction < 0) {
        pos = last;
        while (pos >= start) {
            if (Py_STRING_MATCH(target, pos, pattern, pattern_len)) {
                found++;
                if (--maxcount <= 0)
                    break;
                /* jump over the whole match so matches never overlap */
                pos -= pattern_len;
            }
            else
                pos--;
        }
    }
    else {
        pos = start;
        while (pos <= last) {
            if (Py_STRING_MATCH(target, pos, pattern, pattern_len)) {
                found++;
                if (--maxcount <= 0)
                    break;
                /* jump over the whole match so matches never overlap */
                pos += pattern_len;
            }
            else
                pos++;
        }
    }
    return found;
}
/* Algorithms for different cases of string replacement */ /* Algorithms for different cases of string replacement */
...@@ -2189,9 +1796,8 @@ replace_delete_substring(PyBytesObject *self, ...@@ -2189,9 +1796,8 @@ replace_delete_substring(PyBytesObject *self,
self_len = PyBytes_GET_SIZE(self); self_len = PyBytes_GET_SIZE(self);
self_s = PyBytes_AS_STRING(self); self_s = PyBytes_AS_STRING(self);
count = countstring(self_s, self_len, count = stringlib_count(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, 1,
maxcount); maxcount);
if (count == 0) { if (count == 0) {
...@@ -2211,9 +1817,9 @@ replace_delete_substring(PyBytesObject *self, ...@@ -2211,9 +1817,9 @@ replace_delete_substring(PyBytesObject *self,
start = self_s; start = self_s;
end = self_s + self_len; end = self_s + self_len;
while (count-- > 0) { while (count-- > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset == -1) if (offset == -1)
break; break;
next = start + offset; next = start + offset;
...@@ -2289,9 +1895,9 @@ replace_substring_in_place(PyBytesObject *self, ...@@ -2289,9 +1895,9 @@ replace_substring_in_place(PyBytesObject *self,
self_s = PyBytes_AS_STRING(self); self_s = PyBytes_AS_STRING(self);
self_len = PyBytes_GET_SIZE(self); self_len = PyBytes_GET_SIZE(self);
offset = findstring(self_s, self_len, offset = stringlib_find(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, FORWARD); 0);
if (offset == -1) { if (offset == -1) {
/* No matches; return the original string */ /* No matches; return the original string */
return return_self(self); return return_self(self);
...@@ -2311,9 +1917,9 @@ replace_substring_in_place(PyBytesObject *self, ...@@ -2311,9 +1917,9 @@ replace_substring_in_place(PyBytesObject *self,
end = result_s + self_len; end = result_s + self_len;
while ( --maxcount > 0) { while ( --maxcount > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset==-1) if (offset==-1)
break; break;
Py_MEMCPY(start+offset, to_s, from_len); Py_MEMCPY(start+offset, to_s, from_len);
...@@ -2407,9 +2013,10 @@ replace_substring(PyBytesObject *self, ...@@ -2407,9 +2013,10 @@ replace_substring(PyBytesObject *self,
self_s = PyBytes_AS_STRING(self); self_s = PyBytes_AS_STRING(self);
self_len = PyBytes_GET_SIZE(self); self_len = PyBytes_GET_SIZE(self);
count = countstring(self_s, self_len, count = stringlib_count(self_s, self_len,
from_s, from_len, from_s, from_len,
0, self_len, FORWARD, maxcount); maxcount);
if (count == 0) { if (count == 0) {
/* no matches, return unchanged */ /* no matches, return unchanged */
return return_self(self); return return_self(self);
...@@ -2438,9 +2045,9 @@ replace_substring(PyBytesObject *self, ...@@ -2438,9 +2045,9 @@ replace_substring(PyBytesObject *self,
start = self_s; start = self_s;
end = self_s + self_len; end = self_s + self_len;
while (count-- > 0) { while (count-- > 0) {
offset = findstring(start, end-start, offset = stringlib_find(start, end-start,
from_s, from_len, from_s, from_len,
0, end-start, FORWARD); 0);
if (offset == -1) if (offset == -1)
break; break;
next = start+offset; next = start+offset;
...@@ -2598,7 +2205,7 @@ _bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start, ...@@ -2598,7 +2205,7 @@ _bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start,
return -1; return -1;
str = PyBytes_AS_STRING(self); str = PyBytes_AS_STRING(self);
bytes_adjust_indices(&start, &end, len); ADJUST_INDICES(start, end, len);
if (direction < 0) { if (direction < 0) {
/* startswith */ /* startswith */
...@@ -2703,7 +2310,7 @@ bytes_endswith(PyBytesObject *self, PyObject *args) ...@@ -2703,7 +2310,7 @@ bytes_endswith(PyBytesObject *self, PyObject *args)
PyDoc_STRVAR(decode__doc__, PyDoc_STRVAR(decode__doc__,
"B.decode([encoding[, errors]]) -> str\n\ "B.decode([encoding[, errors]]) -> str\n\
\n\ \n\
Decode S using the codec registered for encoding. encoding defaults\n\ Decode B using the codec registered for encoding. encoding defaults\n\
to the default encoding. errors may be given to set a different error\n\ to the default encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
...@@ -2725,6 +2332,28 @@ bytes_decode(PyObject *self, PyObject *args, PyObject *kwargs) ...@@ -2725,6 +2332,28 @@ bytes_decode(PyObject *self, PyObject *args, PyObject *kwargs)
} }
PyDoc_STRVAR(splitlines__doc__,
"B.splitlines([keepends]) -> list of lines\n\
\n\
Return a list of the lines in B, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.");
/* bytes.splitlines([keepends]): parse the optional integer flag and hand
   the object's buffer and length to the shared stringlib implementation.
   Returns NULL if argument parsing fails. */
static PyObject*
bytes_splitlines(PyObject *self, PyObject *args)
{
    int keepends = 0;

    if (PyArg_ParseTuple(args, "|i:splitlines", &keepends)) {
        const char *data = PyBytes_AS_STRING(self);
        Py_ssize_t size = PyBytes_GET_SIZE(self);

        return stringlib_splitlines(self, data, size, keepends);
    }
    return NULL;
}
PyDoc_STRVAR(fromhex_doc, PyDoc_STRVAR(fromhex_doc,
"bytes.fromhex(string) -> bytes\n\ "bytes.fromhex(string) -> bytes\n\
\n\ \n\
...@@ -2857,7 +2486,7 @@ bytes_methods[] = { ...@@ -2857,7 +2486,7 @@ bytes_methods[] = {
{"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__}, {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__},
{"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__}, {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
{"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__}, {"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__},
{"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS, {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS,
splitlines__doc__}, splitlines__doc__},
{"startswith", (PyCFunction)bytes_startswith, METH_VARARGS, {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS,
startswith__doc__}, startswith__doc__},
...@@ -3239,7 +2868,7 @@ _PyBytes_Resize(PyObject **pv, Py_ssize_t newsize) ...@@ -3239,7 +2868,7 @@ _PyBytes_Resize(PyObject **pv, Py_ssize_t newsize)
/* _PyBytes_FormatLong emulates the format codes d, u, o, x and X, and /* _PyBytes_FormatLong emulates the format codes d, u, o, x and X, and
* the F_ALT flag, for Python's long (unbounded) ints. It's not used for * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
* Python's regular ints. * Python's regular ints.
* Return value: a new PyString*, or NULL if error. * Return value: a new PyBytes*, or NULL if error.
* . *pbuf is set to point into it, * . *pbuf is set to point into it,
* *plen set to the # of chars following that. * *plen set to the # of chars following that.
* Caller must decref it when done using pbuf. * Caller must decref it when done using pbuf.
......
...@@ -9,28 +9,22 @@ ...@@ -9,28 +9,22 @@
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len, stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len) const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
Py_ssize_t maxcount)
{ {
Py_ssize_t count; Py_ssize_t count;
if (str_len < 0) if (str_len < 0)
return 0; /* start > len(str) */ return 0; /* start > len(str) */
if (sub_len == 0) if (sub_len == 0)
return str_len + 1; return (str_len < maxcount) ? str_len + 1 : maxcount;
count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT); count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT);
if (count < 0) if (count < 0)
count = 0; /* no match */ return 0; /* no match */
return count; return count;
} }
#endif #endif
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
...@@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self) ...@@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self)
STRINGLIB_LEN(self)); STRINGLIB_LEN(self));
return newobj; return newobj;
} }
...@@ -18,10 +18,13 @@ ...@@ -18,10 +18,13 @@
#define FAST_SEARCH 1 #define FAST_SEARCH 1
#define FAST_RSEARCH 2 #define FAST_RSEARCH 2
#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1)))))
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m, const STRINGLIB_CHAR* p, Py_ssize_t m,
int mode) Py_ssize_t maxcount, int mode)
{ {
long mask; long mask;
Py_ssize_t skip, count = 0; Py_ssize_t skip, count = 0;
...@@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
w = n - m; w = n - m;
if (w < 0) if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
return -1; return -1;
/* look for special cases */ /* look for special cases */
...@@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* use special case for 1-character strings */ /* use special case for 1-character strings */
if (mode == FAST_COUNT) { if (mode == FAST_COUNT) {
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
if (s[i] == p[0]) if (s[i] == p[0]) {
count++; count++;
if (count == maxcount)
return maxcount;
}
return count; return count;
} else if (mode == FAST_SEARCH) { } else if (mode == FAST_SEARCH) {
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
...@@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
mlast = m - 1; mlast = m - 1;
skip = mlast - 1; skip = mlast - 1;
mask = 0;
if (mode != FAST_RSEARCH) { if (mode != FAST_RSEARCH) {
/* create compressed boyer-moore delta 1 table */ /* create compressed boyer-moore delta 1 table */
/* process pattern[:-1] */ /* process pattern[:-1] */
for (mask = i = 0; i < mlast; i++) { for (i = 0; i < mlast; i++) {
mask |= (1 << (p[i] & 0x1F)); BLOOM_ADD(mask, p[i]);
if (p[i] == p[mlast]) if (p[i] == p[mlast])
skip = mlast - i - 1; skip = mlast - i - 1;
} }
/* process pattern[-1] outside the loop */ /* process pattern[-1] outside the loop */
mask |= (1 << (p[mlast] & 0x1F)); BLOOM_ADD(mask, p[mlast]);
for (i = 0; i <= w; i++) { for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */ /* note: using mlast in the skip path slows things down on x86 */
...@@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
if (mode != FAST_COUNT) if (mode != FAST_COUNT)
return i; return i;
count++; count++;
if (count == maxcount)
return maxcount;
i = i + mlast; i = i + mlast;
continue; continue;
} }
/* miss: check if next character is part of pattern */ /* miss: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F)))) if (!BLOOM(mask, s[i+m]))
i = i + m; i = i + m;
else else
i = i + skip; i = i + skip;
} else { } else {
/* skip: check if next character is part of pattern */ /* skip: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F)))) if (!BLOOM(mask, s[i+m]))
i = i + m; i = i + m;
} }
} }
...@@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* create compressed boyer-moore delta 1 table */ /* create compressed boyer-moore delta 1 table */
/* process pattern[0] outside the loop */ /* process pattern[0] outside the loop */
mask = (1 << (p[0] & 0x1F)); BLOOM_ADD(mask, p[0]);
/* process pattern[:0:-1] */ /* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) { for (i = mlast; i > 0; i--) {
mask |= (1 << (p[i] & 0x1F)); BLOOM_ADD(mask, p[i]);
if (p[i] == p[0]) if (p[i] == p[0])
skip = i - 1; skip = i - 1;
} }
...@@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* got a match! */ /* got a match! */
return i; return i;
/* miss: check if previous character is part of pattern */ /* miss: check if previous character is part of pattern */
if (!(mask & (1 << (s[i-1] & 0x1F)))) if (!BLOOM(mask, s[i-1]))
i = i - m; i = i - m;
else else
i = i - skip; i = i - skip;
} else { } else {
/* skip: check if previous character is part of pattern */ /* skip: check if previous character is part of pattern */
if (!(mask & (1 << (s[i-1] & 0x1F)))) if (!BLOOM(mask, s[i-1]))
i = i - m; i = i - m;
} }
} }
...@@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, ...@@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
} }
#endif #endif
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
...@@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len, ...@@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
if (sub_len == 0) if (sub_len == 0)
return offset; return offset;
pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH); pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH);
if (pos >= 0) if (pos >= 0)
pos += offset; pos += offset;
...@@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, ...@@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
if (sub_len == 0) if (sub_len == 0)
return str_len + offset; return str_len + offset;
pos = fastsearch(str, str_len, sub, sub_len, FAST_RSEARCH); pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH);
if (pos >= 0) if (pos >= 0)
pos += offset; pos += offset;
...@@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, ...@@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
return pos; return pos;
} }
/* Helper macro to fix up start/end slice values in place.

   - end:   capped at len when larger; when negative, len is added and the
            result is floored at 0.
   - start: when negative, len is added and the result is floored at 0.
            start is NOT capped at len here.

   start and end are assigned to, so they must be modifiable lvalues.  Every
   argument is expanded more than once, so avoid arguments with side
   effects.  The expansion is a sequence of bare `if` statements (not a
   single statement), so do not use the macro as the unbraced body of an
   `if`/`else`/loop. */
#define ADJUST_INDICES(start, end, len) \
if (end > len) \
end = len; \
else if (end < 0) { \
end += len; \
if (end < 0) \
end = 0; \
} \
if (start < 0) { \
start += len; \
if (start < 0) \
start = 0; \
}
Py_LOCAL_INLINE(Py_ssize_t) Py_LOCAL_INLINE(Py_ssize_t)
stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
Py_ssize_t start, Py_ssize_t end) Py_ssize_t start, Py_ssize_t end)
{ {
if (start < 0) ADJUST_INDICES(start, end, str_len);
start += str_len;
if (start < 0)
start = 0;
if (end > str_len)
end = str_len;
if (end < 0)
end += str_len;
if (end < 0)
end = 0;
return stringlib_find(str + start, end - start, sub, sub_len, start); return stringlib_find(str + start, end - start, sub, sub_len, start);
} }
...@@ -71,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, ...@@ -71,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
Py_ssize_t start, Py_ssize_t end) Py_ssize_t start, Py_ssize_t end)
{ {
if (start < 0) ADJUST_INDICES(start, end, str_len);
start += str_len;
if (start < 0)
start = 0;
if (end > str_len)
end = str_len;
if (end < 0)
end += str_len;
if (end < 0)
end = 0;
return stringlib_rfind(str + start, end - start, sub, sub_len, start); return stringlib_rfind(str + start, end - start, sub, sub_len, start);
} }
...@@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub) ...@@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub)
) != -1; ) != -1;
} }
#endif /* STRINGLIB_STR */ #endif /* STRINGLIB_WANT_CONTAINS_OBJ */
#ifdef FROM_UNICODE #if STRINGLIB_IS_UNICODE
/* /*
This function is a helper for the "find" family (find, rfind, index, This function is a helper for the "find" family (find, rfind, index,
...@@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring, ...@@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring,
return 1; return 1;
} }
#endif /* FROM_UNICODE */ #endif /* STRINGLIB_IS_UNICODE */
#endif /* STRINGLIB_FIND_H */ #endif /* STRINGLIB_FIND_H */
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
...@@ -8,10 +8,10 @@ ...@@ -8,10 +8,10 @@
#endif #endif
Py_LOCAL_INLINE(PyObject*) Py_LOCAL_INLINE(PyObject*)
stringlib_partition( stringlib_partition(PyObject* str_obj,
PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len PyObject* sep_obj,
) const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
{ {
PyObject* out; PyObject* out;
Py_ssize_t pos; Py_ssize_t pos;
...@@ -25,15 +25,21 @@ stringlib_partition( ...@@ -25,15 +25,21 @@ stringlib_partition(
if (!out) if (!out)
return NULL; return NULL;
pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH); pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH);
if (pos < 0) { if (pos < 0) {
#if STRINGLIB_MUTABLE
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len));
PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0));
#else
Py_INCREF(str_obj); Py_INCREF(str_obj);
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
Py_INCREF(STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY);
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
Py_INCREF(STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY);
PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
#endif
return out; return out;
} }
...@@ -52,10 +58,10 @@ stringlib_partition( ...@@ -52,10 +58,10 @@ stringlib_partition(
} }
Py_LOCAL_INLINE(PyObject*) Py_LOCAL_INLINE(PyObject*)
stringlib_rpartition( stringlib_rpartition(PyObject* str_obj,
PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len PyObject* sep_obj,
) const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
{ {
PyObject* out; PyObject* out;
Py_ssize_t pos; Py_ssize_t pos;
...@@ -69,15 +75,21 @@ stringlib_rpartition( ...@@ -69,15 +75,21 @@ stringlib_rpartition(
if (!out) if (!out)
return NULL; return NULL;
pos = fastsearch(str, str_len, sep, sep_len, FAST_RSEARCH); pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH);
if (pos < 0) { if (pos < 0) {
#if STRINGLIB_MUTABLE
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0));
PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len));
#else
Py_INCREF(STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY);
PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
Py_INCREF(STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY);
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
Py_INCREF(str_obj); Py_INCREF(str_obj);
PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
#endif
return out; return out;
} }
...@@ -96,10 +108,3 @@ stringlib_rpartition( ...@@ -96,10 +108,3 @@ stringlib_rpartition(
} }
#endif #endif
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
/* stringlib: split implementation */
#ifndef STRINGLIB_SPLIT_H
#define STRINGLIB_SPLIT_H
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#endif
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things. */

#define MAX_PREALLOC 12

/* Initial list length for a split limited to `maxsplit` splits: one more
   than the number of splits (5 splits gives 6 elements), capped at
   MAX_PREALLOC.  The argument and the whole expansion are parenthesized so
   that any expression can be passed and the result can be embedded in a
   larger expression.  The argument is expanded twice, so it should have no
   side effects. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
/* Build a new object from data[left:right] with STRINGLIB_NEW and append it
   to `list` with PyList_Append, releasing the local reference afterwards
   whether or not the append succeeded.

   The expansion site must provide: a `PyObject *sub` temporary, the target
   `PyObject *list`, and an `onError:` label, which is jumped to when either
   the allocation or the append fails.

   The expansion is several statements and is not wrapped in braces, so do
   not use it as the unbraced body of an `if`/`else`/loop. */
#define SPLIT_APPEND(data, left, right) \
sub = STRINGLIB_NEW((data) + (left), \
(right) - (left)); \
if (sub == NULL) \
goto onError; \
if (PyList_Append(list, sub)) { \
Py_DECREF(sub); \
goto onError; \
} \
else \
Py_DECREF(sub);
/* Build a new object from data[left:right] with STRINGLIB_NEW and add it as
   element number `count` of `list`, then increment `count`.

   While count < MAX_PREALLOC the object is stored directly into the
   preallocated slot with PyList_SET_ITEM (no DECREF follows, the list keeps
   the reference); beyond that it is appended with PyList_Append and the
   local reference is released.

   The expansion site must provide: a `PyObject *sub` temporary, the target
   `PyObject *list`, a `Py_ssize_t count` of elements stored so far, and an
   `onError:` label, which is jumped to on allocation or append failure.
   The list is presumably expected to have been created with
   PyList_New(PREALLOC_SIZE(...)) so the direct stores stay in bounds. */
#define SPLIT_ADD(data, left, right) { \
sub = STRINGLIB_NEW((data) + (left), \
(right) - (left)); \
if (sub == NULL) \
goto onError; \
if (count < MAX_PREALLOC) { \
PyList_SET_ITEM(list, count, sub); \
} else { \
if (PyList_Append(list, sub)) { \
Py_DECREF(sub); \
goto onError; \
} \
else \
Py_DECREF(sub); \
} \
count++; }
/* Always force the list to the expected size: overwrite the list's size
   field with `count`, the number of elements actually stored, so unused
   preallocated slots are no longer part of the list.  Requires a `count`
   variable at the expansion site. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
/* Split str[0:str_len] on runs of whitespace (as decided by
   STRINGLIB_ISSPACE), scanning left to right and performing at most
   maxcount splits.  Leading whitespace is skipped; once the split budget is
   used up, the remainder (minus its leading whitespace) becomes the last
   element.

   str_obj is the object that owns the buffer; for immutable types it is
   reused as the single list element when it contains no whitespace at all.

   Returns a new list, or NULL if the list could not be created or an
   element could not be added (the partial list is released).
   maxcount is expected to be non-negative -- NOTE(review): callers appear
   to map "no limit" to PY_SSIZE_T_MAX before calling; confirm. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject* str_obj,
                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                           Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    i = j = 0;
    while (maxcount-- > 0) {
        /* skip the whitespace in front of the next word */
        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
            i++;
        if (i == str_len) break;
        /* j = start of the word, i = one past its end */
        j = i; i++;
        while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
            i++;
        /* Test the macro's value (not merely whether it is defined) so that
           a definition of 0 still enables the shortcut; this matches the
           `#if STRINGLIB_MUTABLE` test used in partition.h. */
#if !STRINGLIB_MUTABLE
        if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* No whitespace in str_obj, so just use it as list[0] */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, j, i);
    }

    if (i < str_len) {
        /* Only occurs when maxcount was reached */
        /* Skip any remaining whitespace and copy to end of string */
        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
            i++;
        if (i != str_len)
            SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split str[0:str_len] on every occurrence of the single character ch,
   scanning left to right and performing at most maxcount splits.  The text
   after the last split (possibly empty) is always added as the final
   element.

   str_obj is the object that owns the buffer; for immutable types it is
   reused as the single list element when no split was made.

   Returns a new list, or NULL if the list could not be created or an
   element could not be added (the partial list is released). */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     const STRINGLIB_CHAR ch,
                     Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    /* i = start of the current piece, j = scan cursor */
    i = j = 0;
    while ((j < str_len) && (maxcount-- > 0)) {
        for(; j < str_len; j++) {
            /* I found that using memchr makes no difference */
            if (str[j] == ch) {
                SPLIT_ADD(str, i, j);
                i = j = j + 1;
                break;
            }
        }
    }
    /* Test the macro's value (not merely whether it is defined) so that a
       definition of 0 still enables the shortcut; this matches the
       `#if STRINGLIB_MUTABLE` test used in partition.h. */
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* ch not in str_obj, so just use str_obj as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (i <= str_len) {
        SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split str[0:str_len] on every occurrence of the separator
   sep[0:sep_len], scanning left to right with fastsearch() and performing
   at most maxcount splits.  The text after the last split (possibly empty)
   is always added as the final element.

   An empty separator sets ValueError("empty separator") and returns NULL.
   A one-character separator is delegated to stringlib_split_char().

   str_obj is the object that owns the buffer; for immutable types it is
   reused as the single list element when no split was made.

   Returns a new list, or NULL on error (the partial list is released). */
Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject* str_obj,
                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                Py_ssize_t maxcount)
{
    Py_ssize_t i, j, pos, count=0;
    PyObject *list, *sub;

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    }
    else if (sep_len == 1)
        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    /* i = start of the current piece; pos is relative to str + i */
    i = j = 0;
    while (maxcount-- > 0) {
        pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
        if (pos < 0)
            break;
        j = i + pos;
        SPLIT_ADD(str, i, j);
        i = j + sep_len;
    }
    /* Test the macro's value (not merely whether it is defined) so that a
       definition of 0 still enables the shortcut; this matches the
       `#if STRINGLIB_MUTABLE` test used in partition.h. */
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* No match in str_obj, so just use it as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split str[0:str_len] on runs of whitespace (as decided by
   STRINGLIB_ISSPACE), scanning right to left and performing at most
   maxcount splits.  Trailing whitespace is skipped; once the split budget
   is used up, the remainder (minus its trailing whitespace) becomes one
   more element.  Pieces are collected right to left and the list is
   reversed before it is returned.

   str_obj is the object that owns the buffer; for immutable types it is
   reused as the single list element when it contains no whitespace at all.

   Returns a new list, or NULL if the list could not be created, an element
   could not be added, or the reversal failed (the list is released). */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject* str_obj,
                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                            Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    i = j = str_len - 1;
    while (maxcount-- > 0) {
        /* skip the whitespace behind the next word */
        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
            i--;
        if (i < 0) break;
        /* j = last character of the word, i = one before its start */
        j = i; i--;
        while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
            i--;
        /* Test the macro's value (not merely whether it is defined) so that
           a definition of 0 still enables the shortcut; this matches the
           `#if STRINGLIB_MUTABLE` test used in partition.h. */
#if !STRINGLIB_MUTABLE
        if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* No whitespace in str_obj, so just use it as list[0] */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, i + 1, j + 1);
    }

    if (i >= 0) {
        /* Only occurs when maxcount was reached */
        /* Skip any remaining whitespace and copy to beginning of string */
        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
            i--;
        if (i >= 0)
            SPLIT_ADD(str, 0, i + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split str[0:str_len] on every occurrence of the single character ch,
   scanning right to left and performing at most maxcount splits.  The text
   before the leftmost split (possibly empty) is added as one more element.
   Pieces are collected right to left and the list is reversed before it is
   returned.

   str_obj is the object that owns the buffer; for immutable types it is
   reused as the single list element when no split was made.

   Returns a new list, or NULL if the list could not be created, an element
   could not be added, or the reversal failed (the list is released). */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject* str_obj,
                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                      const STRINGLIB_CHAR ch,
                      Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    /* i = scan cursor, j = last character of the current piece */
    i = j = str_len - 1;
    while ((i >= 0) && (maxcount-- > 0)) {
        for(; i >= 0; i--) {
            if (str[i] == ch) {
                SPLIT_ADD(str, i + 1, j + 1);
                j = i = i - 1;
                break;
            }
        }
    }
    /* Test the macro's value (not merely whether it is defined) so that a
       definition of 0 still enables the shortcut; this matches the
       `#if STRINGLIB_MUTABLE` test used in partition.h. */
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* ch not in str_obj, so just use str_obj as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (j >= -1) {
        SPLIT_ADD(str, 0, j + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` at occurrences of the separator `sep`, working from the
   right-hand end and making at most `maxcount` cuts.

   An empty separator sets ValueError("empty separator") and returns NULL;
   a one-character separator is delegated to stringlib_rsplit_char().
   When STRINGLIB_MUTABLE is not defined and nothing was cut off, str_obj
   itself is stored (with a new reference) provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list in left-to-right order, or NULL after releasing the
   list if an item could not be created or stored, or if reversing failed. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject* str_obj,
                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                 Py_ssize_t maxcount)
{
    Py_ssize_t end;        /* exclusive right edge of the unsplit prefix */
    Py_ssize_t hit;        /* index of the match reported by fastsearch */
    Py_ssize_t count = 0;  /* items stored so far (used by SPLIT_ADD) */
    PyObject *list, *sub;

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    }
    if (sep_len == 1)
        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (end = str_len; maxcount > 0; maxcount--) {
        /* Reverse search restricted to the part not yet split off. */
        hit = fastsearch(str, end, sep, sep_len, -1, FAST_RSEARCH);
        if (hit < 0)
            break;
        SPLIT_ADD(str, hit + sep_len, end);
        end = hit;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was cut off, so the input itself is the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        /* Whatever lies left of the last cut (possibly empty). */
        SPLIT_ADD(str, 0, end);
    }
    FIX_PREALLOC_SIZE(list);

    /* Items were collected right-to-left. */
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Break `str` into lines at every character for which
   STRINGLIB_ISLINEBREAK is true, treating "\r\n" as a single break.
   The break characters are kept at the end of each line only when
   `keepends` is non-zero.

   When STRINGLIB_MUTABLE is not defined and the first line already covers
   the whole input, str_obj itself is appended provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list, or NULL after releasing the list if an item could
   not be created or appended. */
Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     int keepends)
{
    /* The list is deliberately not preallocated: splitlines is usually run
       with hundreds of newlines, and switching between PyList_SET_ITEM and
       append was measured at about a 2-3% slowdown for that common case.
       Doing the SET_ITEMs first and appending only once the prealloc buffer
       is full would avoid that, but is too much work for little gain. */
    Py_ssize_t scan = 0;        /* read cursor */
    Py_ssize_t line_start = 0;  /* first index of the line being collected */
    PyObject *sub;              /* scratch slot used by SPLIT_APPEND */
    PyObject *list = PyList_New(0);

    if (list == NULL)
        return NULL;

    while (scan < str_len) {
        Py_ssize_t eol;

        /* Advance to the next line break, or to the end of the input. */
        for (; scan < str_len; scan++) {
            if (STRINGLIB_ISLINEBREAK(str[scan]))
                break;
        }

        eol = scan;
        if (scan < str_len) {
            /* Step over the break; CRLF counts as one line break. */
            int crlf = (str[scan] == '\r' &&
                        scan + 1 < str_len &&
                        str[scan + 1] == '\n');
            scan += crlf ? 2 : 1;
            if (keepends)
                eol = scan;
        }
#ifndef STRINGLIB_MUTABLE
        if (line_start == 0 && eol == str_len &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The first line is the whole input: reuse str_obj as list[0]. */
            if (PyList_Append(list, str_obj))
                goto onError;
            break;
        }
#endif
        SPLIT_APPEND(str, line_start, eol);
        line_start = scan;
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
#endif
/* stringlib: split implementation */
#ifndef STRINGLIB_SPLIT_H
#define STRINGLIB_SPLIT_H
#ifndef STRINGLIB_FASTSEARCH_H
#error must include "stringlib/fastsearch.h" before including this module
#endif
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  E.g., "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things. */

#define MAX_PREALLOC 12

/* Initial list size for a split limited to `maxsplit` cuts: one slot per
   resulting piece (5 splits gives 6 elements), capped at MAX_PREALLOC.

   The argument is parenthesized so that an expression such as `a | b` or
   `c ? x : y` is evaluated as a unit.  It is still expanded twice, so it
   must not have side effects. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)
/* Create the substring data[left:right] with STRINGLIB_NEW and append it to
   `list`.

   The expansion relies on names from the enclosing function: a
   `PyObject *sub` scratch variable, the `list` being built, and an
   `onError` label, which is jumped to when the substring cannot be created
   or appended.  The reference held in `sub` is released in both the success
   and the append-failure case.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and can be used safely as the body of an unbraced
   if/else or loop. */
#define SPLIT_APPEND(data, left, right)         \
    do {                                        \
        sub = STRINGLIB_NEW((data) + (left),    \
                            (right) - (left));  \
        if (sub == NULL)                        \
            goto onError;                       \
        if (PyList_Append(list, sub)) {         \
            Py_DECREF(sub);                     \
            goto onError;                       \
        }                                       \
        Py_DECREF(sub);                         \
    } while (0)
/* Create the substring data[left:right] with STRINGLIB_NEW and store it as
   item number `count` of `list`, then increment `count`.

   While `count` is below MAX_PREALLOC the item is written directly with
   PyList_SET_ITEM, which hands the reference in `sub` over to the list;
   after that the item is appended with PyList_Append and the local
   reference is released.

   The expansion relies on names from the enclosing function: `sub`, `list`,
   `count` and an `onError` label, which is jumped to when the substring
   cannot be created or appended.

   The body is wrapped in do { } while (0) rather than bare braces so that
   `SPLIT_ADD(...);` is exactly one statement, which keeps
   `if (c) SPLIT_ADD(...); else ...` valid. */
#define SPLIT_ADD(data, left, right)                \
    do {                                            \
        sub = STRINGLIB_NEW((data) + (left),        \
                            (right) - (left));      \
        if (sub == NULL)                            \
            goto onError;                           \
        if (count < MAX_PREALLOC) {                 \
            PyList_SET_ITEM(list, count, sub);      \
        } else {                                    \
            if (PyList_Append(list, sub)) {         \
                Py_DECREF(sub);                     \
                goto onError;                       \
            }                                       \
            Py_DECREF(sub);                         \
        }                                           \
        count++;                                    \
    } while (0)
/* Always force the list to the expected size: assigns `count` to
   Py_SIZE(list).  Note that `count` is not a macro parameter; it is taken
   from the scope in which the macro is expanded. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
/* Split `str` on runs of whitespace, working from the left-hand end.

   At most `maxcount` words are cut off the front; if input is still left at
   that point, the remainder (minus its leading whitespace) becomes the last
   item.  When STRINGLIB_MUTABLE is not defined and the whole input is a
   single word, str_obj itself is stored (with a new reference) provided it
   passes STRINGLIB_CHECK_EXACT.

   Returns a new list, or NULL after releasing the list if an item could
   not be created or stored. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject* str_obj,
                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                           Py_ssize_t maxcount)
{
    Py_ssize_t pos = 0;    /* scan cursor, moves rightwards */
    Py_ssize_t first = 0;  /* index of the current word's first char */
    Py_ssize_t count = 0;  /* items stored so far (used by SPLIT_ADD) */
    PyObject *sub;         /* scratch slot used by SPLIT_ADD */
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    for (; maxcount > 0; maxcount--) {
        /* Drop the whitespace in front of the next word. */
        while (pos < str_len && STRINGLIB_ISSPACE(str[pos]))
            pos++;
        if (pos == str_len)
            break;

        /* `pos` sits on the word's first character; walk to just past its
           last character. */
        first = pos;
        for (pos++; pos < str_len; pos++) {
            if (STRINGLIB_ISSPACE(str[pos]))
                break;
        }
#ifndef STRINGLIB_MUTABLE
        if (first == 0 && pos == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The word spans the whole input: share str_obj as list[0]
               instead of copying it. */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, first, pos);
    }

    if (pos < str_len) {
        /* Only reached when the split limit was hit: strip the whitespace in
           front of what is left and keep the rest as one item. */
        while (pos < str_len && STRINGLIB_ISSPACE(str[pos]))
            pos++;
        if (pos != str_len) {
            SPLIT_ADD(str, pos, str_len);
        }
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` at occurrences of the single character `ch`, working from the
   left-hand end and making at most `maxcount` cuts.

   When STRINGLIB_MUTABLE is not defined and nothing was cut off, str_obj
   itself is stored (with a new reference) provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list, or NULL after releasing the list if an item could
   not be created or stored. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     const STRINGLIB_CHAR ch,
                     Py_ssize_t maxcount)
{
    Py_ssize_t start = 0;  /* first index of the piece being formed */
    Py_ssize_t scan = 0;   /* scan cursor, moves rightwards */
    Py_ssize_t count = 0;  /* items stored so far (used by SPLIT_ADD) */
    PyObject *sub;         /* scratch slot used by SPLIT_ADD */
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    while (scan < str_len && maxcount-- > 0) {
        /* Step right to the nearest separator, if there is one.  (The
           original author found memchr made no difference here.) */
        while (scan < str_len && str[scan] != ch)
            scan++;
        if (scan < str_len) {
            SPLIT_ADD(str, start, scan);
            scan++;
            start = scan;
        }
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was cut off, so the input itself is the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (start <= str_len) {
        /* Whatever lies right of the last cut (possibly empty). */
        SPLIT_ADD(str, start, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` at occurrences of the separator `sep`, working from the
   left-hand end and making at most `maxcount` cuts.

   An empty separator sets ValueError("empty separator") and returns NULL;
   a one-character separator is delegated to stringlib_split_char().
   When STRINGLIB_MUTABLE is not defined and nothing was cut off, str_obj
   itself is stored (with a new reference) provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list, or NULL after releasing the list if an item could
   not be created or stored. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject* str_obj,
                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                Py_ssize_t maxcount)
{
    Py_ssize_t start = 0;  /* first index of the part not yet split off */
    Py_ssize_t hit;        /* match position reported by fastsearch */
    Py_ssize_t count = 0;  /* items stored so far (used by SPLIT_ADD) */
    PyObject *list, *sub;

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    }
    if (sep_len == 1)
        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (; maxcount > 0; maxcount--) {
        hit = fastsearch(str+start, str_len-start, sep, sep_len, -1, FAST_SEARCH);
        if (hit < 0)
            break;
        /* The search began at `start`, so shift the result to an index
           into `str`. */
        hit += start;
        SPLIT_ADD(str, start, hit);
        start = hit + sep_len;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was cut off, so the input itself is the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        /* Whatever lies right of the last cut (possibly empty). */
        SPLIT_ADD(str, start, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` on runs of whitespace, working from the right-hand end.

   At most `maxcount` words are cut off the end; if input is still left at
   that point, the remainder (minus the whitespace on its right edge)
   becomes the first item.  When STRINGLIB_MUTABLE is not defined and the
   whole input is a single word, str_obj itself is stored (with a new
   reference) provided it passes STRINGLIB_CHECK_EXACT.

   Returns a new list in left-to-right order, or NULL after releasing the
   list if an item could not be created or stored, or if reversing failed. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject* str_obj,
                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                            Py_ssize_t maxcount)
{
    Py_ssize_t pos = str_len - 1;   /* scan cursor, moves leftwards */
    Py_ssize_t last = str_len - 1;  /* index of the current word's final char */
    Py_ssize_t count = 0;           /* items stored so far (used by SPLIT_ADD) */
    PyObject *sub;                  /* scratch slot used by SPLIT_ADD */
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    for (; maxcount > 0; maxcount--) {
        /* Drop the whitespace to the right of the next word. */
        while (pos >= 0 && STRINGLIB_ISSPACE(str[pos]))
            pos--;
        if (pos < 0)
            break;

        /* `pos` sits on the word's final character; walk to just before
           its first character. */
        last = pos;
        for (pos--; pos >= 0; pos--) {
            if (STRINGLIB_ISSPACE(str[pos]))
                break;
        }
#ifndef STRINGLIB_MUTABLE
        if (last == str_len - 1 && pos < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The word spans the whole input: share str_obj as list[0]
               instead of copying it. */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, pos + 1, last + 1);
    }

    if (pos >= 0) {
        /* Only reached when the split limit was hit: strip the whitespace on
           the right edge of what is left and keep the rest as one item. */
        while (pos >= 0 && STRINGLIB_ISSPACE(str[pos]))
            pos--;
        if (pos >= 0) {
            SPLIT_ADD(str, 0, pos + 1);
        }
    }
    FIX_PREALLOC_SIZE(list);

    /* Items were collected right-to-left. */
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` at occurrences of the single character `ch`, working from the
   right-hand end and making at most `maxcount` cuts.

   When STRINGLIB_MUTABLE is not defined and nothing was cut off, str_obj
   itself is stored (with a new reference) provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list in left-to-right order, or NULL after releasing the
   list if an item could not be created or stored, or if reversing failed. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject* str_obj,
                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                      const STRINGLIB_CHAR ch,
                      Py_ssize_t maxcount)
{
    Py_ssize_t scan = str_len - 1;  /* scan cursor, moves leftwards */
    Py_ssize_t tail = str_len - 1;  /* last index of the piece being formed */
    Py_ssize_t count = 0;           /* items stored so far (used by SPLIT_ADD) */
    PyObject *sub;                  /* scratch slot used by SPLIT_ADD */
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    while (scan >= 0 && maxcount-- > 0) {
        /* Step left to the nearest separator, if there is one. */
        while (scan >= 0 && str[scan] != ch)
            scan--;
        if (scan >= 0) {
            SPLIT_ADD(str, scan + 1, tail + 1);
            scan--;
            tail = scan;
        }
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was cut off, so the input itself is the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (tail >= -1) {
        /* Whatever lies left of the last cut (possibly empty). */
        SPLIT_ADD(str, 0, tail + 1);
    }
    FIX_PREALLOC_SIZE(list);

    /* Items were collected right-to-left. */
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Split `str` at occurrences of the separator `sep`, working from the
   right-hand end and making at most `maxcount` cuts.

   An empty separator sets ValueError("empty separator") and returns NULL;
   a one-character separator is delegated to stringlib_rsplit_char().
   When STRINGLIB_MUTABLE is not defined and nothing was cut off, str_obj
   itself is stored (with a new reference) provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list in left-to-right order, or NULL after releasing the
   list if an item could not be created or stored, or if reversing failed. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject* str_obj,
                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                 Py_ssize_t maxcount)
{
    Py_ssize_t end;        /* exclusive right edge of the unsplit prefix */
    Py_ssize_t hit;        /* index of the match reported by fastsearch */
    Py_ssize_t count = 0;  /* items stored so far (used by SPLIT_ADD) */
    PyObject *list, *sub;

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    }
    if (sep_len == 1)
        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (end = str_len; maxcount > 0; maxcount--) {
        /* Reverse search restricted to the part not yet split off. */
        hit = fastsearch(str, end, sep, sep_len, -1, FAST_RSEARCH);
        if (hit < 0)
            break;
        SPLIT_ADD(str, hit + sep_len, end);
        end = hit;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was cut off, so the input itself is the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        /* Whatever lies left of the last cut (possibly empty). */
        SPLIT_ADD(str, 0, end);
    }
    FIX_PREALLOC_SIZE(list);

    /* Items were collected right-to-left. */
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Break `str` into lines at every character for which
   STRINGLIB_ISLINEBREAK is true, treating "\r\n" as a single break.
   The break characters are kept at the end of each line only when
   `keepends` is non-zero.

   When STRINGLIB_MUTABLE is not defined and the first line already covers
   the whole input, str_obj itself is appended provided it passes
   STRINGLIB_CHECK_EXACT.

   Returns a new list, or NULL after releasing the list if an item could
   not be created or appended. */
Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     int keepends)
{
    /* The list is deliberately not preallocated: splitlines is usually run
       with hundreds of newlines, and switching between PyList_SET_ITEM and
       append was measured at about a 2-3% slowdown for that common case.
       Doing the SET_ITEMs first and appending only once the prealloc buffer
       is full would avoid that, but is too much work for little gain. */
    Py_ssize_t scan = 0;        /* read cursor */
    Py_ssize_t line_start = 0;  /* first index of the line being collected */
    PyObject *sub;              /* scratch slot used by SPLIT_APPEND */
    PyObject *list = PyList_New(0);

    if (list == NULL)
        return NULL;

    while (scan < str_len) {
        Py_ssize_t eol;

        /* Advance to the next line break, or to the end of the input. */
        for (; scan < str_len; scan++) {
            if (STRINGLIB_ISLINEBREAK(str[scan]))
                break;
        }

        eol = scan;
        if (scan < str_len) {
            /* Step over the break; CRLF counts as one line break. */
            int crlf = (str[scan] == '\r' &&
                        scan + 1 < str_len &&
                        str[scan + 1] == '\n');
            scan += crlf ? 2 : 1;
            if (keepends)
                eol = scan;
        }
#ifndef STRINGLIB_MUTABLE
        if (line_start == 0 && eol == str_len &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The first line is the whole input: reuse str_obj as list[0]. */
            if (PyList_Append(list, str_obj))
                goto onError;
            break;
        }
#endif
        SPLIT_APPEND(str, line_start, eol);
        line_start = scan;
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
#endif
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#define STRINGLIB_TYPE_NAME "string" #define STRINGLIB_TYPE_NAME "string"
#define STRINGLIB_PARSE_CODE "S" #define STRINGLIB_PARSE_CODE "S"
#define STRINGLIB_EMPTY nullstring #define STRINGLIB_EMPTY nullstring
#define STRINGLIB_ISSPACE Py_ISSPACE
#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) #define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9'))
#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) #define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1)
#define STRINGLIB_TOUPPER Py_TOUPPER #define STRINGLIB_TOUPPER Py_TOUPPER
......
/* NOTE: this API is -ONLY- for use with single byte character strings. */ /* NOTE: this API is -ONLY- for use with single byte character strings. */
/* Do not use it with Unicode. */ /* Do not use it with Unicode. */
#include "bytes_methods.h"
#ifndef STRINGLIB_MUTABLE
#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0"
#define STRINGLIB_MUTABLE 0
#endif
/* the more complicated methods. parts of these should be pulled out into the /* the more complicated methods. parts of these should be pulled out into the
shared code in bytes_methods.c to cut down on duplicate code bloat. */ shared code in bytes_methods.c to cut down on duplicate code bloat. */
...@@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args) ...@@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args)
return (PyObject*) s; return (PyObject*) s;
} }
/* Create the substring data[left:right] with STRINGLIB_NEW and append it to
   `list`.

   The expansion relies on names from the enclosing function: a
   `PyObject *str` scratch variable, the `list` being built, and an
   `onError` label, which is jumped to when the substring cannot be created
   or appended.  The reference held in `str` is released in both the success
   and the append-failure case.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and can be used safely as the body of an unbraced
   if/else or loop. */
#define _STRINGLIB_SPLIT_APPEND(data, left, right)  \
    do {                                            \
        str = STRINGLIB_NEW((data) + (left),        \
                            (right) - (left));      \
        if (str == NULL)                            \
            goto onError;                           \
        if (PyList_Append(list, str)) {             \
            Py_DECREF(str);                         \
            goto onError;                           \
        }                                           \
        Py_DECREF(str);                             \
    } while (0)
PyDoc_STRVAR(splitlines__doc__,
"B.splitlines([keepends]) -> list of lines\n\
\n\
Return a list of the lines in B, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.");
/* Method implementation: split the byte data of `self` into lines.

   `args` may carry one optional integer, `keepends`; when it is non-zero the
   line-break characters are kept at the end of each returned line.  Lines
   end at '\n' or '\r', and "\r\n" is treated as a single break.

   Returns a new list, or NULL if argument parsing fails, the list cannot be
   created, or an item cannot be created or appended (the partially built
   list is released in that case). */
static PyObject*
stringlib_splitlines(PyObject *self, PyObject *args)
{
    Py_ssize_t i;      /* read cursor */
    Py_ssize_t j;      /* first index of the line being collected */
    Py_ssize_t len;
    int keepends = 0;
    PyObject *list;
    PyObject *str;     /* scratch slot used by _STRINGLIB_SPLIT_APPEND */
    char *data;

    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
        return NULL;

    data = STRINGLIB_STR(self);
    len = STRINGLIB_LEN(self);

    /* This does not use the preallocated list because splitlines is
       usually run with hundreds of newlines.  The overhead of
       switching between PyList_SET_ITEM and append causes about a
       2-3% slowdown for that common case.  A smarter implementation
       could move the if check out, so the SET_ITEMs are done first
       and the appends only done when the prealloc buffer is full.
       That's too much work for little gain. */
    list = PyList_New(0);
    if (list == NULL)
        return NULL;   /* nothing to release yet */

    for (i = j = 0; i < len; ) {
        Py_ssize_t eol;

        /* Find a line and append it */
        while (i < len && data[i] != '\n' && data[i] != '\r')
            i++;

        /* Skip the line break reading CRLF as one line break */
        eol = i;
        if (i < len) {
            if (data[i] == '\r' && i + 1 < len && data[i+1] == '\n')
                i += 2;
            else
                i++;
            if (keepends)
                eol = i;
        }
        _STRINGLIB_SPLIT_APPEND(data, j, eol);
        j = i;
    }
    /* No trailing append is needed: every iteration ends with j = i and the
       loop only exits once i >= len, so there is never an unconsumed tail
       (the former `if (j < len)` branch could not be reached). */
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
#undef _STRINGLIB_SPLIT_APPEND
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_TYPE_NAME "unicode"
#define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_PARSE_CODE "U"
#define STRINGLIB_EMPTY unicode_empty #define STRINGLIB_EMPTY unicode_empty
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
......
...@@ -210,7 +210,8 @@ PyUnicode_GetMax(void) ...@@ -210,7 +210,8 @@ PyUnicode_GetMax(void)
static BLOOM_MASK bloom_linebreak; static BLOOM_MASK bloom_linebreak;
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) #define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1)))))
#define BLOOM_LINEBREAK(ch) \ #define BLOOM_LINEBREAK(ch) \
((ch) < 128U ? ascii_linebreak[(ch)] : \ ((ch) < 128U ? ascii_linebreak[(ch)] : \
...@@ -225,7 +226,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) ...@@ -225,7 +226,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
mask = 0; mask = 0;
for (i = 0; i < len; i++) for (i = 0; i < len; i++)
mask |= (1 << (ptr[i] & 0x1F)); BLOOM_ADD(mask, ptr[i]);
return mask; return mask;
} }
...@@ -5873,28 +5874,30 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, ...@@ -5873,28 +5874,30 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
#include "stringlib/unicodedefs.h" #include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h" #include "stringlib/fastsearch.h"
#include "stringlib/count.h" #include "stringlib/count.h"
/* Include _ParseTupleFinds from find.h */
#define FROM_UNICODE
#include "stringlib/find.h" #include "stringlib/find.h"
#include "stringlib/partition.h" #include "stringlib/partition.h"
#include "stringlib/split.h"
#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
#include "stringlib/localeutil.h" #include "stringlib/localeutil.h"
/* helper macro to fixup start/end slice values */ /* helper macro to fixup start/end slice values */
#define FIX_START_END(obj) \ #define ADJUST_INDICES(start, end, len) \
if (start < 0) \ if (end > len) \
start += (obj)->length; \ end = len; \
else if (end < 0) { \
end += len; \
if (end < 0) \
end = 0; \
} \
if (start < 0) { \
start += len; \
if (start < 0) \ if (start < 0) \
start = 0; \ start = 0; \
if (end > (obj)->length) \ }
end = (obj)->length; \
if (end < 0) \
end += (obj)->length; \
if (end < 0) \
end = 0;
Py_ssize_t PyUnicode_Count(PyObject *str, Py_ssize_t PyUnicode_Count(PyObject *str,
PyObject *substr, PyObject *substr,
...@@ -5914,10 +5917,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str, ...@@ -5914,10 +5917,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
return -1; return -1;
} }
FIX_START_END(str_obj); ADJUST_INDICES(start, end, str_obj->length);
result = stringlib_count( result = stringlib_count(
str_obj->str + start, end - start, sub_obj->str, sub_obj->length str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
PY_SSIZE_T_MAX
); );
Py_DECREF(sub_obj); Py_DECREF(sub_obj);
...@@ -5972,8 +5975,7 @@ int tailmatch(PyUnicodeObject *self, ...@@ -5972,8 +5975,7 @@ int tailmatch(PyUnicodeObject *self,
if (substring->length == 0) if (substring->length == 0)
return 1; return 1;
FIX_START_END(self); ADJUST_INDICES(start, end, self->length);
end -= substring->length; end -= substring->length;
if (end < start) if (end < start)
return 0; return 0;
...@@ -6314,305 +6316,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self, ...@@ -6314,305 +6316,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
return u; return u;
} }
#define SPLIT_APPEND(data, left, right) \ PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
if (!str) \
goto onError; \
if (PyList_Append(list, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str);
/* Append to `list` the whitespace-separated words of `self`, cutting off at
   most `maxcount` words from the left.  Once the limit is reached, the rest
   of the string from the start of the next word onwards is appended as one
   final item.

   Returns `list` on success.  If an item cannot be created or appended,
   `list` is released and NULL is returned. */
static PyObject *
split_whitespace(PyUnicodeObject *self,
                 PyObject *list,
                 Py_ssize_t maxcount)
{
    const Py_UNICODE *text = self->str;
    Py_ssize_t size = self->length;
    Py_ssize_t scan = 0;   /* read cursor */
    Py_ssize_t start = 0;  /* first index of the current word / remainder */
    PyObject *str;         /* scratch slot used by SPLIT_APPEND */

    while (scan < size) {
        /* Locate the next word: skip whitespace, then run to its end. */
        while (scan < size && Py_UNICODE_ISSPACE(text[scan]))
            scan++;
        start = scan;
        while (scan < size && !Py_UNICODE_ISSPACE(text[scan]))
            scan++;

        if (start >= scan)
            continue;       /* only whitespace was left */
        if (maxcount-- <= 0)
            break;          /* limit reached; `start` marks the remainder */

        SPLIT_APPEND(text, start, scan);

        /* Move past the separator so the remainder starts on a word. */
        while (scan < size && Py_UNICODE_ISSPACE(text[scan]))
            scan++;
        start = scan;
    }
    if (start < size) {
        SPLIT_APPEND(text, start, size);
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
PyObject *PyUnicode_Splitlines(PyObject *string,
int keepends)
{ {
register Py_ssize_t i;
register Py_ssize_t j;
Py_ssize_t len;
PyObject *list; PyObject *list;
PyObject *str;
Py_UNICODE *data;
string = PyUnicode_FromObject(string); string = PyUnicode_FromObject(string);
if (string == NULL) if (string == NULL)
return NULL; return NULL;
data = PyUnicode_AS_UNICODE(string);
len = PyUnicode_GET_SIZE(string);
list = PyList_New(0);
if (!list)
goto onError;
for (i = j = 0; i < len; ) {
Py_ssize_t eol;
/* Find a line and append it */
while (i < len && !BLOOM_LINEBREAK(data[i]))
i++;
/* Skip the line break reading CRLF as one line break */
eol = i;
if (i < len) {
if (data[i] == '\r' && i + 1 < len &&
data[i+1] == '\n')
i += 2;
else
i++;
if (keepends)
eol = i;
}
SPLIT_APPEND(data, j, eol);
j = i;
}
if (j < len) {
SPLIT_APPEND(data, j, len);
}
Py_DECREF(string); list = stringlib_splitlines(
return list; (PyObject*) string, PyUnicode_AS_UNICODE(string),
PyUnicode_GET_SIZE(string), keepends);
onError:
Py_XDECREF(list);
Py_DECREF(string); Py_DECREF(string);
return NULL;
}
/* Append to `list` the pieces of `self` obtained by cutting at occurrences
   of the character `ch`, left to right, making at most `maxcount` cuts.
   The part after the last cut is always appended, even when empty.

   Returns `list` on success.  If an item cannot be created or appended,
   `list` is released and NULL is returned. */
static PyObject *
split_char(PyUnicodeObject *self,
           PyObject *list,
           Py_UNICODE ch,
           Py_ssize_t maxcount)
{
    const Py_UNICODE *text = self->str;
    Py_ssize_t size = self->length;
    Py_ssize_t scan;       /* read cursor */
    Py_ssize_t start = 0;  /* first index of the piece being formed */
    PyObject *str;         /* scratch slot used by SPLIT_APPEND */

    for (scan = 0; scan < size; scan++) {
        if (text[scan] != ch)
            continue;
        if (maxcount-- <= 0)
            break;
        SPLIT_APPEND(text, start, scan);
        start = scan + 1;
    }
    if (start <= size) {
        SPLIT_APPEND(text, start, size);
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
static
PyObject *split_substring(PyUnicodeObject *self,
PyObject *list,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
register Py_ssize_t i;
register Py_ssize_t j;
Py_ssize_t len = self->length;
Py_ssize_t sublen = substring->length;
PyObject *str;
for (i = j = 0; i <= len - sublen; ) {
if (Py_UNICODE_MATCH(self, i, substring)) {
if (maxcount-- <= 0)
break;
SPLIT_APPEND(self->str, j, i);
i = j = i + sublen;
} else
i++;
}
if (j <= len) {
SPLIT_APPEND(self->str, j, len);
}
return list; return list;
onError:
Py_DECREF(list);
return NULL;
} }
/* Append to `list` the whitespace-separated words of `self`, cutting off at
   most `maxcount` words from the right.  Once the limit is reached, the rest
   of the string up to the end of the next word is appended as one item.
   The list is reversed before returning, so items collected right-to-left
   end up in left-to-right order.

   Returns `list` on success.  If an item cannot be created or appended, or
   the reversal fails, `list` is released and NULL is returned. */
static PyObject *
rsplit_whitespace(PyUnicodeObject *self,
                  PyObject *list,
                  Py_ssize_t maxcount)
{
    const Py_UNICODE *text = self->str;
    Py_ssize_t size = self->length;
    Py_ssize_t scan = size - 1;  /* read cursor, moves leftwards */
    Py_ssize_t last = size - 1;  /* final index of the current word / remainder */
    PyObject *str;               /* scratch slot used by SPLIT_APPEND */

    while (scan >= 0) {
        /* Locate the next word: skip whitespace, then run to its start. */
        while (scan >= 0 && Py_UNICODE_ISSPACE(text[scan]))
            scan--;
        last = scan;
        while (scan >= 0 && !Py_UNICODE_ISSPACE(text[scan]))
            scan--;

        if (last <= scan)
            continue;       /* only whitespace was left */
        if (maxcount-- <= 0)
            break;          /* limit reached; `last` marks the remainder */

        SPLIT_APPEND(text, scan + 1, last + 1);

        /* Move past the separator so the remainder ends on a word. */
        while (scan >= 0 && Py_UNICODE_ISSPACE(text[scan]))
            scan--;
        last = scan;
    }
    if (last >= 0) {
        SPLIT_APPEND(text, 0, last + 1);
    }
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Append to `list` the pieces of `self` obtained by cutting at occurrences
   of the character `ch`, right to left, making at most `maxcount` cuts.
   The part before the last cut is always appended, even when empty.  The
   list is reversed before returning, so the pieces end up in left-to-right
   order.

   Returns `list` on success.  If an item cannot be created or appended, or
   the reversal fails, `list` is released and NULL is returned. */
static PyObject *
rsplit_char(PyUnicodeObject *self,
            PyObject *list,
            Py_UNICODE ch,
            Py_ssize_t maxcount)
{
    const Py_UNICODE *text = self->str;
    Py_ssize_t size = self->length;
    Py_ssize_t scan;             /* read cursor, moves leftwards */
    Py_ssize_t tail = size - 1;  /* last index of the piece being formed */
    PyObject *str;               /* scratch slot used by SPLIT_APPEND */

    for (scan = size - 1; scan >= 0; scan--) {
        if (text[scan] != ch)
            continue;
        if (maxcount-- <= 0)
            break;
        SPLIT_APPEND(text, scan + 1, tail + 1);
        tail = scan - 1;
    }
    if (tail >= -1) {
        SPLIT_APPEND(text, 0, tail + 1);
    }
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
/* Append to `list` the pieces of `self` obtained by cutting at occurrences
   of `substring`, right to left, making at most `maxcount` cuts.  The part
   before the last cut is always appended, even when empty.  The list is
   reversed before returning, so the pieces end up in left-to-right order.

   Returns `list` on success.  If an item cannot be created or appended, or
   the reversal fails, `list` is released and NULL is returned. */
static PyObject *
rsplit_substring(PyUnicodeObject *self,
                 PyObject *list,
                 PyUnicodeObject *substring,
                 Py_ssize_t maxcount)
{
    Py_ssize_t size = self->length;
    Py_ssize_t sep_size = substring->length;
    Py_ssize_t probe = size - sep_size;  /* candidate match start */
    Py_ssize_t end = size;               /* exclusive right edge of the piece */
    PyObject *str;                       /* scratch slot used by SPLIT_APPEND */

    while (probe >= 0) {
        if (Py_UNICODE_MATCH(self, probe, substring)) {
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(self->str, probe + sep_size, end);
            end = probe;
            /* Jump a full separator length so matches never overlap. */
            probe -= sep_size;
        }
        else {
            probe--;
        }
    }
    if (end >= 0) {
        SPLIT_APPEND(self->str, 0, end);
    }
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
#undef SPLIT_APPEND
static static
PyObject *split(PyUnicodeObject *self, PyObject *split(PyUnicodeObject *self,
PyUnicodeObject *substring, PyUnicodeObject *substring,
Py_ssize_t maxcount) Py_ssize_t maxcount)
{ {
PyObject *list;
if (maxcount < 0) if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX; maxcount = PY_SSIZE_T_MAX;
list = PyList_New(0);
if (!list)
return NULL;
if (substring == NULL) if (substring == NULL)
return split_whitespace(self,list,maxcount); return stringlib_split_whitespace(
(PyObject*) self, self->str, self->length, maxcount
else if (substring->length == 1) );
return split_char(self,list,substring->str[0],maxcount);
else if (substring->length == 0) { return stringlib_split(
Py_DECREF(list); (PyObject*) self, self->str, self->length,
PyErr_SetString(PyExc_ValueError, "empty separator"); substring->str, substring->length,
return NULL; maxcount
} );
else
return split_substring(self,list,substring,maxcount);
} }
static static
...@@ -6620,28 +6357,19 @@ PyObject *rsplit(PyUnicodeObject *self, ...@@ -6620,28 +6357,19 @@ PyObject *rsplit(PyUnicodeObject *self,
PyUnicodeObject *substring, PyUnicodeObject *substring,
Py_ssize_t maxcount) Py_ssize_t maxcount)
{ {
PyObject *list;
if (maxcount < 0) if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX; maxcount = PY_SSIZE_T_MAX;
list = PyList_New(0);
if (!list)
return NULL;
if (substring == NULL) if (substring == NULL)
return rsplit_whitespace(self,list,maxcount); return stringlib_rsplit_whitespace(
(PyObject*) self, self->str, self->length, maxcount
else if (substring->length == 1) );
return rsplit_char(self,list,substring->str[0],maxcount);
else if (substring->length == 0) { return stringlib_rsplit(
Py_DECREF(list); (PyObject*) self, self->str, self->length,
PyErr_SetString(PyExc_ValueError, "empty separator"); substring->str, substring->length,
return NULL; maxcount
} );
else
return rsplit_substring(self,list,substring,maxcount);
} }
static static
...@@ -6654,9 +6382,13 @@ PyObject *replace(PyUnicodeObject *self, ...@@ -6654,9 +6382,13 @@ PyObject *replace(PyUnicodeObject *self,
if (maxcount < 0) if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX; maxcount = PY_SSIZE_T_MAX;
else if (maxcount == 0 || self->length == 0)
goto nothing;
if (str1->length == str2->length) { if (str1->length == str2->length) {
/* same length */ /* same length */
if (str1->length == 0)
goto nothing;
Py_ssize_t i; Py_ssize_t i;
if (str1->length == 1) { if (str1->length == 1) {
/* replace characters */ /* replace characters */
...@@ -6676,8 +6408,8 @@ PyObject *replace(PyUnicodeObject *self, ...@@ -6676,8 +6408,8 @@ PyObject *replace(PyUnicodeObject *self,
u->str[i] = u2; u->str[i] = u2;
} }
} else { } else {
i = fastsearch( i = stringlib_find(
self->str, self->length, str1->str, str1->length, FAST_SEARCH self->str, self->length, str1->str, str1->length, 0
); );
if (i < 0) if (i < 0)
goto nothing; goto nothing;
...@@ -6685,14 +6417,20 @@ PyObject *replace(PyUnicodeObject *self, ...@@ -6685,14 +6417,20 @@ PyObject *replace(PyUnicodeObject *self,
if (!u) if (!u)
return NULL; return NULL;
Py_UNICODE_COPY(u->str, self->str, self->length); Py_UNICODE_COPY(u->str, self->str, self->length);
while (i <= self->length - str1->length)
if (Py_UNICODE_MATCH(self, i, str1)) { /* change everything in-place, starting with this one */
if (--maxcount < 0) Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
i += str1->length;
while ( --maxcount > 0) {
i = stringlib_find(self->str+i, self->length-i,
str1->str, str1->length,
i);
if (i == -1)
break; break;
Py_UNICODE_COPY(u->str+i, str2->str, str2->length); Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
i += str1->length; i += str1->length;
} else }
i++;
} }
} else { } else {
...@@ -6701,9 +6439,8 @@ PyObject *replace(PyUnicodeObject *self, ...@@ -6701,9 +6439,8 @@ PyObject *replace(PyUnicodeObject *self,
Py_UNICODE *p; Py_UNICODE *p;
/* replace strings */ /* replace strings */
n = stringlib_count(self->str, self->length, str1->str, str1->length); n = stringlib_count(self->str, self->length, str1->str, str1->length,
if (n > maxcount) maxcount);
n = maxcount;
if (n == 0) if (n == 0)
goto nothing; goto nothing;
/* new_size = self->length + n * (str2->length - str1->length)); */ /* new_size = self->length + n * (str2->length - str1->length)); */
...@@ -6733,15 +6470,12 @@ PyObject *replace(PyUnicodeObject *self, ...@@ -6733,15 +6470,12 @@ PyObject *replace(PyUnicodeObject *self,
if (str1->length > 0) { if (str1->length > 0) {
while (n-- > 0) { while (n-- > 0) {
/* look for next match */ /* look for next match */
j = i; j = stringlib_find(self->str+i, self->length-i,
while (j <= e) { str1->str, str1->length,
if (Py_UNICODE_MATCH(self, j, str1)) i);
break; if (j == -1)
j++;
}
if (j > i) {
if (j > e)
break; break;
else if (j > i) {
/* copy unchanged part [i:j] */ /* copy unchanged part [i:j] */
Py_UNICODE_COPY(p, self->str+i, j-i); Py_UNICODE_COPY(p, self->str+i, j-i);
p += j - i; p += j - i;
...@@ -7192,11 +6926,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args) ...@@ -7192,11 +6926,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
if (substring == NULL) if (substring == NULL)
return NULL; return NULL;
FIX_START_END(self); ADJUST_INDICES(start, end, self->length);
result = PyLong_FromSsize_t( result = PyLong_FromSsize_t(
stringlib_count(self->str + start, end - start, stringlib_count(self->str + start, end - start,
substring->str, substring->length) substring->str, substring->length,
PY_SSIZE_T_MAX)
); );
Py_DECREF(substring); Py_DECREF(substring);
...@@ -10066,11 +9800,3 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) ...@@ -10066,11 +9800,3 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
...@@ -1490,6 +1490,10 @@ ...@@ -1490,6 +1490,10 @@
RelativePath="..\..\Objects\sliceobject.c" RelativePath="..\..\Objects\sliceobject.c"
> >
</File> </File>
<File
RelativePath="..\..\Objects\stringlib\split.h"
>
</File>
<File <File
RelativePath="..\..\Objects\structseq.c" RelativePath="..\..\Objects\structseq.c"
> >
......
...@@ -1495,6 +1495,10 @@ ...@@ -1495,6 +1495,10 @@
RelativePath="..\Objects\sliceobject.c" RelativePath="..\Objects\sliceobject.c"
> >
</File> </File>
<File
RelativePath="..\Objects\stringlib\split.h"
>
</File>
<File <File
RelativePath="..\Objects\structseq.c" RelativePath="..\Objects\structseq.c"
> >
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment