Commit 95e8053a authored by Guido van Rossum

1.5a3 prerelease 1 from AMK

parent a74ef66a
......@@ -317,10 +317,19 @@ class Eol(Instruction):
class Set(Instruction):
name = 'set'
def __init__(self, set):
def __init__(self, set, flags=0):
self.set = set
Instruction.__init__(self, chr(3), 33)
if flags & IGNORECASE: self.set=map(string.lower, self.set)
if len(set)==1:
# If only one element, use the "exact" opcode (it'll be faster)
Instruction.__init__(self, chr(4), 2)
else:
# Use the "set" opcode
Instruction.__init__(self, chr(3), 33)
def assemble(self, position, labels):
if len(self.set)==1:
# If only one character in set, generate an "exact" opcode
return self.opcode + self.set[0]
result = self.opcode
temp = 0
for i, c in map(lambda x: (x, chr(x)), range(256)):
......@@ -333,14 +342,16 @@ class Set(Instruction):
def __repr__(self):
result = '%-15s' % (self.name)
self.set.sort()
# XXX this should print more intelligently
for char in self.set:
result = result + char
return result
class Exact(Instruction):
name = 'exact'
def __init__(self, char):
def __init__(self, char, flags):
self.char = char
if flags & IGNORECASE: self.char=string.lower(self.char)
Instruction.__init__(self, chr(4), 2)
def assemble(self, position, labels):
return self.opcode + self.char
......@@ -881,7 +892,7 @@ def compile(pattern, flags=0):
escape_type, value, index = expand_escape(pattern, index)
if escape_type == CHAR:
stack.append([Exact(value)])
stack.append([Exact(value, flags)])
lastop = '\\' + value
elif escape_type == MEMORY_REFERENCE:
......@@ -1306,7 +1317,7 @@ def compile(pattern, flags=0):
elif char == '.':
if flags & DOTALL:
stack.append([Set(map(chr, range(256)))])
stack.append([Set(map(chr, range(256)), flags)])
else:
stack.append([AnyChar()])
lastop = '.'
......@@ -1336,12 +1347,12 @@ def compile(pattern, flags=0):
index = end + 1
# do not change lastop
else:
stack.append([Exact(char)])
stack.append([Exact(char, flags)])
lastop = '#'
elif char in string.whitespace:
if not (flags & VERBOSE):
stack.append([Exact(char)])
stack.append([Exact(char, flags)])
lastop = char
elif char == '[':
......@@ -1449,22 +1460,25 @@ def compile(pattern, flags=0):
index = index + 1
if negate:
# If case is being ignored, then both upper- and lowercase
# versions of the letters must be excluded.
if flags & IGNORECASE: set=set+map(string.upper, set)
notset = []
for char in map(chr, range(256)):
if char not in set:
notset.append(char)
if len(notset) == 0:
raise error, 'empty negated set'
stack.append([Set(notset)])
stack.append([Set(notset, flags)])
else:
if len(set) == 0:
raise error, 'empty set'
stack.append([Set(set)])
stack.append([Set(set, flags)])
lastop = '[]'
else:
stack.append([Exact(char)])
stack.append([Exact(char, flags)])
lastop = char
code = []
......@@ -1485,6 +1499,7 @@ def compile(pattern, flags=0):
code.append(Label(label))
label = label + 1
code.append(End())
# print code
return RegexObject(pattern, flags, code, register, groupindex)
# Replace expand_escape and _expand functions with their C equivalents.
......
......@@ -318,6 +318,7 @@ tests = [
# ('((((((((((a))))))))))\\41', 'aa', FAIL),
# ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'),
('((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'),
('multiple words of text', 'uh-uh', FAIL),
('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'),
......@@ -448,7 +449,6 @@ tests = [
('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'),
#('(?i)((((((((((a))))))))))\\41', 'AA', FAIL),
#('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'),
('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR),
('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'),
('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'),
......@@ -506,10 +506,21 @@ xyzabc
('a.b', 'a\nb', FAIL),
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
# test \w, etc.
# test \w, etc. both inside and outside character classes
('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'),
('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'),
('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'),
('[\\d-x]', '-', SYNTAX_ERROR),
(r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
(r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
(r'[\t][\n][\v][\r][\f][\a][\A][\b][\B][\Z][\g]', '\t\n\v\r\f\aA\bBZg', SUCCEED, 'found', '\t\n\v\r\f\aA\bBZg'),
]
......@@ -278,6 +278,12 @@ tests = [
('\\([xyz]*\\)x', 'abcx', SUCCEED,
'found+"-"+g1', 'x-'),
('\\(a\\)+b\\|aac', 'aac', SUCCEED,
'found+"-"+g1', 'aac-None')
'found+"-"+g1', 'aac-None'),
('\<a', 'a', SUCCEED, 'found', 'a'),
('\<a', '!', FAIL),
('a\<b', 'ab', FAIL),
('a\>', 'ab', FAIL),
('a\>', 'a!', SUCCEED, 'found', 'a'),
('a\>', 'a', SUCCEED, 'found', 'a'),
]
......@@ -31,6 +31,10 @@ try:
assert re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx') == 'xxxx'
assert re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a') == '\t\n\v\r\f\a\bBZ\aAwWsSdD'
assert re.sub('a', '\t\n\v\r\f\a', 'a') == '\t\n\v\r\f\a'
assert re.sub('a', '\t\n\v\r\f\a', 'a') == (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))
except AssertionError:
raise TestFailed, "re.sub"
......@@ -120,7 +124,6 @@ if verbose:
print 'Running re_tests test suite'
for t in tests:
print t
sys.stdout.flush()
pattern=s=outcome=repl=expected=None
if len(t)==5:
......@@ -136,6 +139,7 @@ for t in tests:
if outcome==SYNTAX_ERROR: pass # Expected a syntax error
else:
print '=== Syntax error:', t
except KeyboardInterrupt: raise KeyboardInterrupt
except:
print '*** Unexpected error ***'
if verbose:
......@@ -182,3 +186,10 @@ for t in tests:
print repr(repl)+' should be '+repr(expected)
else:
print '=== Failed incorrectly', t
# Try the match with IGNORECASE enabled, and check that it
# still succeeds.
obj=re.compile(pattern, re.IGNORECASE)
result=obj.search(s)
if result==None:
print '=== Fails on case-insensitive match', t
......@@ -132,8 +132,10 @@ regobj_match(re, args)
re->re_lastok = NULL;
result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs);
if (result < -1) {
/* Failure like stack overflow */
PyErr_SetString(RegexError, "match failure");
/* Serious failure of some sort; if re_match didn't
set an exception, raise a generic error */
if (!PyErr_Occurred())
PyErr_SetString(RegexError, "match failure");
return NULL;
}
if (result >= 0) {
......@@ -174,8 +176,10 @@ regobj_search(re, args)
result = re_search(&re->re_patbuf, buffer, size, offset, range,
&re->re_regs);
if (result < -1) {
/* Failure like stack overflow */
PyErr_SetString(RegexError, "match failure");
/* Serious failure of some sort; if re_match didn't
set an exception, raise a generic error */
if (!PyErr_Occurred())
PyErr_SetString(RegexError, "match failure");
return NULL;
}
if (result >= 0) {
......
This diff is collapsed.
......@@ -33,16 +33,16 @@ extern "C" {
typedef struct re_pattern_buffer
{
char *buffer; /* compiled pattern */
unsigned char *buffer; /* compiled pattern */
int allocated; /* allocated size of compiled pattern */
int used; /* actual length of compiled pattern */
char *fastmap; /* fastmap[ch] is true if ch can start pattern */
char *translate; /* translation to apply during compilation/matching */
char fastmap_accurate; /* true if fastmap is valid */
char can_be_null; /* true if can match empty string */
char uses_registers; /* registers are used and need to be initialized */
unsigned char *fastmap; /* fastmap[ch] is true if ch can start pattern */
unsigned char *translate; /* translation to apply during compilation/matching */
unsigned char fastmap_accurate; /* true if fastmap is valid */
unsigned char can_be_null; /* true if can match empty string */
unsigned char uses_registers; /* registers are used and need to be initialized */
int num_registers; /* number of registers used */
char anchor; /* anchor: 0=none 1=begline 2=begbuf */
unsigned char anchor; /* anchor: 0=none 1=begline 2=begbuf */
} *regexp_t;
typedef struct re_registers
......@@ -93,7 +93,7 @@ extern int re_syntax;
/* This is the actual syntax mask. It was added so that Python could do
* syntax-dependent munging of patterns before compilation. */
extern char re_syntax_table[256];
extern unsigned char re_syntax_table[256];
void re_compile_initialize(void);
......@@ -101,7 +101,7 @@ int re_set_syntax(int syntax);
/* This sets the syntax to use and returns the previous syntax. The
* syntax is specified by a bit mask of the above defined bits. */
char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
unsigned char *re_compile_pattern(unsigned char *regex, int regex_size, regexp_t compiled);
/* This compiles the regexp (given in regex and length in regex_size).
* This returns NULL if the regexp compiled successfully, and an error
* message if an error was encountered. The buffer field must be
......@@ -110,14 +110,14 @@ char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
* buffer is NULL). Also, the translate field must be set to point to a
* valid translation table, or NULL if it is not used. */
int re_match(regexp_t compiled, char *string, int size, int pos,
int re_match(regexp_t compiled, unsigned char *string, int size, int pos,
regexp_registers_t old_regs);
/* This tries to match the regexp against the string. This returns the
* length of the matched portion, or -1 if the pattern could not be
* matched and -2 if an error (such as failure stack overflow) is
* encountered. */
int re_search(regexp_t compiled, char *string, int size, int startpos,
int re_search(regexp_t compiled, unsigned char *string, int size, int startpos,
int range, regexp_registers_t regs);
/* This searches for a substring matching the regexp. This returns the
* first index at which a match is found. range specifies at how many
......@@ -132,28 +132,16 @@ void re_compile_fastmap(regexp_t compiled);
* the calling program must have initialized the fastmap field to point
* to an array of 256 characters. */
char *re_comp(char *s);
/* BSD 4.2 regex library routine re_comp. This compiles the regexp into
* an internal buffer. This returns NULL if the regexp was compiled
* successfully, and an error message if there was an error. */
int re_exec(char *s);
/* BSD 4.2 regexp library routine re_exec. This returns true if the
* string matches the regular expression (that is, a matching part is
* found anywhere in the string). */
#else /* HAVE_PROTOTYPES */
extern int re_syntax;
extern char re_syntax_table[256];
extern unsigned char re_syntax_table[256];
void re_compile_initialize();
int re_set_syntax();
char *re_compile_pattern();
unsigned char *re_compile_pattern();
int re_match();
int re_search();
void re_compile_fastmap();
char *re_comp();
int re_exec();
#endif /* HAVE_PROTOTYPES */
......
......@@ -62,7 +62,7 @@ static PyObject *ReopError; /* Exception */
#define BEGINNING_OF_BUFFER 7
#define END_OF_BUFFER 8
static char *reop_casefold;
static unsigned char *reop_casefold;
static PyObject *
makeresult(regs, num_regs)
......@@ -105,7 +105,7 @@ reop_match(self, args)
PyObject *self;
PyObject *args;
{
char *string;
unsigned char *string;
int fastmaplen, stringlen;
int can_be_null, anchor, i;
int flags, pos, result;
......@@ -163,8 +163,8 @@ reop_match(self, args)
if (result < -1) {
/* Failure like stack overflow */
PyErr_SetString(ReopError, "match failure");
if (!PyErr_Occurred())
PyErr_SetString(ReopError, "match failure");
return NULL;
}
if (result == -1) {
......@@ -174,12 +174,38 @@ reop_match(self, args)
return makeresult(&re_regs, bufp.num_registers);
}
#if 0
static PyObject *
reop_optimize(self, args)
PyObject *self;
PyObject *args;
{
unsigned char *buffer;
int buflen;
struct re_pattern_buffer bufp;
PyObject *opt_code;
if (!PyArg_Parse(args, "(s#)", &buffer, &buflen)) return NULL;
/* Create a new string for the optimized code */
opt_code=PyString_FromStringAndSize(buffer, buflen);
if (opt_code!=NULL)
{
bufp.buffer = PyString_AsString(opt_code);
bufp.used=bufp.allocated=buflen;
}
return opt_code;
}
#endif
static PyObject *
reop_search(self, args)
PyObject *self;
PyObject *args;
{
char *string;
unsigned char *string;
int fastmaplen, stringlen;
int can_be_null, anchor, i;
int flags, pos, result;
......@@ -237,7 +263,8 @@ reop_search(self, args)
if (result < -1) {
/* Failure like stack overflow */
PyErr_SetString(ReopError, "match failure");
if (!PyErr_Occurred())
PyErr_SetString(ReopError, "match failure");
return NULL;
}
......@@ -626,7 +653,7 @@ reop__expand(self, args)
{
PyObject *results, *match_obj;
PyObject *repl_obj, *newstring;
char *repl;
unsigned char *repl;
int size, total_len, i, start, pos;
if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
......@@ -810,7 +837,7 @@ internal_split(args, retain)
reopobject *pattern;
int maxsplit=0, count=0, length, next=0, result;
int match_end=0; /* match_start is defined below */
char *start;
unsigned char *start;
if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
&maxsplit))
......@@ -911,6 +938,7 @@ static struct PyMethodDef reop_global_methods[] = {
{"expand_escape", reop_expand_escape, 1},
{"_expand", reop__expand, 1},
#if 0
{"_optimize", reop_optimize, 0},
{"split", reop_split, 0},
{"splitx", reop_splitx, 0},
#endif
......@@ -922,8 +950,8 @@ initreop()
{
PyObject *m, *d, *k, *v, *o;
int i;
char *s;
char j[2];
unsigned char *s;
unsigned char j[2];
re_compile_initialize();
......@@ -936,7 +964,7 @@ initreop()
goto finally;
/* Initialize reop.casefold constant */
if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
if (!(v = PyString_FromStringAndSize((unsigned char *)NULL, 256)))
goto finally;
if (!(s = PyString_AsString(v)))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment