Commit ad446d57 authored by Serhiy Storchaka

Issue #22578: Added attributes to the re.error class.

parent eb99e515
...@@ -733,13 +733,36 @@ form. ...@@ -733,13 +733,36 @@ form.
Clear the regular expression cache. Clear the regular expression cache.
.. exception:: error .. exception:: error(msg, pattern=None, pos=None)
Exception raised when a string passed to one of the functions here is not a Exception raised when a string passed to one of the functions here is not a
valid regular expression (for example, it might contain unmatched parentheses) valid regular expression (for example, it might contain unmatched parentheses)
or when some other error occurs during compilation or matching. It is never an or when some other error occurs during compilation or matching. It is never an
error if a string contains no match for a pattern. error if a string contains no match for a pattern. The error instance has
the following additional attributes:
.. attribute:: msg
The unformatted error message.
.. attribute:: pattern
The regular expression pattern.
.. attribute:: pos
The index in *pattern* where compilation failed (may be ``None``).
.. attribute:: lineno
The line corresponding to *pos* (may be ``None``).
.. attribute:: colno
The column corresponding to *pos* (may be ``None``).
.. versionchanged:: 3.5
Added additional attributes.
.. _re-objects: .. _re-objects:
......
...@@ -21,7 +21,35 @@ from _sre import MAXREPEAT, MAXGROUPS ...@@ -21,7 +21,35 @@ from _sre import MAXREPEAT, MAXGROUPS
# should this really be here? # should this really be here?
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:
        msg: The unformatted error message.
        pattern: The regular expression pattern (str or bytes), or None.
        pos: The index in pattern where compilation failed, or None.
        lineno: The 1-based line corresponding to pos, or None.
        colno: The 1-based column corresponding to pos, or None.

    lineno and colno are only computed when both pattern and pos are
    given; otherwise they are None.
    """

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            msg = '%s at position %d' % (msg, pos)
            # The newline literal must match the pattern type so that
            # count()/rfind() work for both str and bytes patterns.
            if isinstance(pattern, str):
                newline = '\n'
            else:
                newline = b'\n'
            self.lineno = pattern.count(newline, 0, pos) + 1
            # rfind() yields -1 when no newline precedes pos, which makes
            # the column 1-based on the first line as well.
            self.colno = pos - pattern.rfind(newline, 0, pos)
            # Line/column information is only worth showing for
            # multi-line patterns.
            if newline in pattern:
                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
        else:
            self.lineno = self.colno = None
        super().__init__(msg)
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of index *pos* in *doc*.

    *doc* may be a str or a bytes-like object; the newline literal used
    for counting is chosen to match its type.
    """
    if isinstance(doc, str):
        newline = '\n'
    else:
        newline = b'\n'
    lineno = doc.count(newline, 0, pos) + 1
    # rfind() returns -1 when no newline precedes pos, so the same
    # expression gives a 1-based column on the first line too.
    colno = pos - doc.rfind(newline, 0, pos)
    return lineno, colno
class _NamedIntConstant(int): class _NamedIntConstant(int):
......
...@@ -81,8 +81,8 @@ class Pattern: ...@@ -81,8 +81,8 @@ class Pattern:
if name is not None: if name is not None:
ogid = self.groupdict.get(name, None) ogid = self.groupdict.get(name, None)
if ogid is not None: if ogid is not None:
raise error("redefinition of group name %s as group %d; " raise error("redefinition of group name %r as group %d; "
"was group %d" % (repr(name), gid, ogid)) "was group %d" % (name, gid, ogid))
self.groupdict[name] = gid self.groupdict[name] = gid
return gid return gid
def closegroup(self, gid, p): def closegroup(self, gid, p):
...@@ -206,24 +206,25 @@ class SubPattern: ...@@ -206,24 +206,25 @@ class SubPattern:
class Tokenizer: class Tokenizer:
def __init__(self, string): def __init__(self, string):
self.istext = isinstance(string, str) self.istext = isinstance(string, str)
self.string = string
if not self.istext: if not self.istext:
string = str(string, 'latin1') string = str(string, 'latin1')
self.string = string self.decoded_string = string
self.index = 0 self.index = 0
self.__next() self.__next()
def __next(self): def __next(self):
index = self.index index = self.index
try: try:
char = self.string[index] char = self.decoded_string[index]
except IndexError: except IndexError:
self.next = None self.next = None
return return
if char == "\\": if char == "\\":
index += 1 index += 1
try: try:
char += self.string[index] char += self.decoded_string[index]
except IndexError: except IndexError:
raise error("bogus escape (end of line)") raise self.error("bogus escape (end of line)") from None
self.index = index + 1 self.index = index + 1
self.next = char self.next = char
def match(self, char): def match(self, char):
...@@ -250,15 +251,19 @@ class Tokenizer: ...@@ -250,15 +251,19 @@ class Tokenizer:
c = self.next c = self.next
self.__next() self.__next()
if c is None: if c is None:
raise error("unterminated name") raise self.error("unterminated name")
if c == terminator: if c == terminator:
break break
result += c result += c
return result return result
def tell(self): def tell(self):
return self.index, self.next return self.index - len(self.next or '')
def seek(self, index): def seek(self, index):
self.index, self.next = index self.index = index
self.__next()
def error(self, msg, offset=0):
return error(msg, self.string, self.tell() - offset)
# The following three functions are not used in this module anymore, but we keep # The following three functions are not used in this module anymore, but we keep
# them here (with DeprecationWarnings) for backwards compatibility. # them here (with DeprecationWarnings) for backwards compatibility.
...@@ -322,8 +327,8 @@ def _class_escape(source, escape): ...@@ -322,8 +327,8 @@ def _class_escape(source, escape):
escape += source.getwhile(2, OCTDIGITS) escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8) c = int(escape[1:], 8)
if c > 0o377: if c > 0o377:
raise error('octal escape value %r outside of ' raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape) 'range 0-0o377' % escape, len(escape))
return LITERAL, c return LITERAL, c
elif c in DIGITS: elif c in DIGITS:
raise ValueError raise ValueError
...@@ -331,7 +336,7 @@ def _class_escape(source, escape): ...@@ -331,7 +336,7 @@ def _class_escape(source, escape):
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
raise error("bogus escape: %s" % repr(escape)) raise source.error("bogus escape: %r" % escape, len(escape))
def _escape(source, escape, state): def _escape(source, escape, state):
# handle escape code in expression # handle escape code in expression
...@@ -377,21 +382,23 @@ def _escape(source, escape, state): ...@@ -377,21 +382,23 @@ def _escape(source, escape, state):
escape += source.get() escape += source.get()
c = int(escape[1:], 8) c = int(escape[1:], 8)
if c > 0o377: if c > 0o377:
raise error('octal escape value %r outside of ' raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape) 'range 0-0o377' % escape,
len(escape))
return LITERAL, c return LITERAL, c
# not an octal escape, so this is a group reference # not an octal escape, so this is a group reference
group = int(escape[1:]) group = int(escape[1:])
if group < state.groups: if group < state.groups:
if not state.checkgroup(group): if not state.checkgroup(group):
raise error("cannot refer to open group") raise source.error("cannot refer to open group",
len(escape))
return GROUPREF, group return GROUPREF, group
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
return LITERAL, ord(escape[1]) return LITERAL, ord(escape[1])
except ValueError: except ValueError:
pass pass
raise error("bogus escape: %s" % repr(escape)) raise source.error("bogus escape: %r" % escape, len(escape))
def _parse_sub(source, state, nested=True): def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c # parse an alternation: a|b|c
...@@ -404,7 +411,7 @@ def _parse_sub(source, state, nested=True): ...@@ -404,7 +411,7 @@ def _parse_sub(source, state, nested=True):
if not sourcematch("|"): if not sourcematch("|"):
break break
if nested and source.next is not None and source.next != ")": if nested and source.next is not None and source.next != ")":
raise error("pattern not properly closed") raise source.error("pattern not properly closed")
if len(items) == 1: if len(items) == 1:
return items[0] return items[0]
...@@ -449,11 +456,11 @@ def _parse_sub_cond(source, state, condgroup): ...@@ -449,11 +456,11 @@ def _parse_sub_cond(source, state, condgroup):
if source.match("|"): if source.match("|"):
item_no = _parse(source, state) item_no = _parse(source, state)
if source.next == "|": if source.next == "|":
raise error("conditional backref with more than two branches") raise source.error("conditional backref with more than two branches")
else: else:
item_no = None item_no = None
if source.next is not None and source.next != ")": if source.next is not None and source.next != ")":
raise error("pattern not properly closed") raise source.error("pattern not properly closed")
subpattern = SubPattern(state) subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern return subpattern
...@@ -510,7 +517,7 @@ def _parse(source, state): ...@@ -510,7 +517,7 @@ def _parse(source, state):
while True: while True:
this = sourceget() this = sourceget()
if this is None: if this is None:
raise error("unexpected end of regular expression") raise source.error("unexpected end of regular expression")
if this == "]" and set != start: if this == "]" and set != start:
break break
elif this[0] == "\\": elif this[0] == "\\":
...@@ -521,7 +528,7 @@ def _parse(source, state): ...@@ -521,7 +528,7 @@ def _parse(source, state):
# potential range # potential range
this = sourceget() this = sourceget()
if this is None: if this is None:
raise error("unexpected end of regular expression") raise source.error("unexpected end of regular expression")
if this == "]": if this == "]":
if code1[0] is IN: if code1[0] is IN:
code1 = code1[1][0] code1 = code1[1][0]
...@@ -533,11 +540,11 @@ def _parse(source, state): ...@@ -533,11 +540,11 @@ def _parse(source, state):
else: else:
code2 = LITERAL, _ord(this) code2 = LITERAL, _ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL: if code1[0] != LITERAL or code2[0] != LITERAL:
raise error("bad character range") raise source.error("bad character range", len(this))
lo = code1[1] lo = code1[1]
hi = code2[1] hi = code2[1]
if hi < lo: if hi < lo:
raise error("bad character range") raise source.error("bad character range", len(this))
setappend((RANGE, (lo, hi))) setappend((RANGE, (lo, hi)))
else: else:
if code1[0] is IN: if code1[0] is IN:
...@@ -555,6 +562,7 @@ def _parse(source, state): ...@@ -555,6 +562,7 @@ def _parse(source, state):
elif this in REPEAT_CHARS: elif this in REPEAT_CHARS:
# repeat previous item # repeat previous item
here = source.tell()
if this == "?": if this == "?":
min, max = 0, 1 min, max = 0, 1
elif this == "*": elif this == "*":
...@@ -566,7 +574,6 @@ def _parse(source, state): ...@@ -566,7 +574,6 @@ def _parse(source, state):
if source.next == "}": if source.next == "}":
subpatternappend((LITERAL, _ord(this))) subpatternappend((LITERAL, _ord(this)))
continue continue
here = source.tell()
min, max = 0, MAXREPEAT min, max = 0, MAXREPEAT
lo = hi = "" lo = hi = ""
while source.next in DIGITS: while source.next in DIGITS:
...@@ -589,18 +596,21 @@ def _parse(source, state): ...@@ -589,18 +596,21 @@ def _parse(source, state):
if max >= MAXREPEAT: if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large") raise OverflowError("the repetition number is too large")
if max < min: if max < min:
raise error("bad repeat interval") raise source.error("bad repeat interval",
source.tell() - here)
else: else:
raise error("not supported") raise source.error("not supported", len(this))
# figure out which item to repeat # figure out which item to repeat
if subpattern: if subpattern:
item = subpattern[-1:] item = subpattern[-1:]
else: else:
item = None item = None
if not item or (_len(item) == 1 and item[0][0] == AT): if not item or (_len(item) == 1 and item[0][0] == AT):
raise error("nothing to repeat") raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES: if item[0][0] in _REPEATCODES:
raise error("multiple repeat") raise source.error("multiple repeat",
source.tell() - here + len(this))
if sourcematch("?"): if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item)) subpattern[-1] = (MIN_REPEAT, (min, max, item))
else: else:
...@@ -618,7 +628,7 @@ def _parse(source, state): ...@@ -618,7 +628,7 @@ def _parse(source, state):
# options # options
char = sourceget() char = sourceget()
if char is None: if char is None:
raise error("unexpected end of pattern") raise self.error("unexpected end of pattern")
if char == "P": if char == "P":
# python extensions # python extensions
if sourcematch("<"): if sourcematch("<"):
...@@ -626,28 +636,32 @@ def _parse(source, state): ...@@ -626,28 +636,32 @@ def _parse(source, state):
name = source.getuntil(">") name = source.getuntil(">")
group = 1 group = 1
if not name: if not name:
raise error("missing group name") raise source.error("missing group name", 1)
if not name.isidentifier(): if not name.isidentifier():
raise error("bad character in group name %r" % name) raise source.error("bad character in group name "
"%r" % name,
len(name) + 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")") name = source.getuntil(")")
if not name: if not name:
raise error("missing group name") raise source.error("missing group name", 1)
if not name.isidentifier(): if not name.isidentifier():
raise error("bad character in backref group name " raise source.error("bad character in backref "
"%r" % name) "group name %r" % name,
len(name) + 1)
gid = state.groupdict.get(name) gid = state.groupdict.get(name)
if gid is None: if gid is None:
msg = "unknown group name: {0!r}".format(name) msg = "unknown group name: {0!r}".format(name)
raise error(msg) raise source.error(msg, len(name) + 1)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
continue continue
else: else:
char = sourceget() char = sourceget()
if char is None: if char is None:
raise error("unexpected end of pattern") raise source.error("unexpected end of pattern")
raise error("unknown specifier: ?P%s" % char) raise source.error("unknown specifier: ?P%s" % char,
len(char))
elif char == ":": elif char == ":":
# non-capturing group # non-capturing group
group = 2 group = 2
...@@ -655,7 +669,7 @@ def _parse(source, state): ...@@ -655,7 +669,7 @@ def _parse(source, state):
# comment # comment
while True: while True:
if source.next is None: if source.next is None:
raise error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if sourceget() == ")": if sourceget() == ")":
break break
continue continue
...@@ -665,11 +679,11 @@ def _parse(source, state): ...@@ -665,11 +679,11 @@ def _parse(source, state):
if char == "<": if char == "<":
char = sourceget() char = sourceget()
if char is None or char not in "=!": if char is None or char not in "=!":
raise error("syntax error") raise source.error("syntax error")
dir = -1 # lookbehind dir = -1 # lookbehind
p = _parse_sub(source, state) p = _parse_sub(source, state)
if not sourcematch(")"): if not sourcematch(")"):
raise error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if char == "=": if char == "=":
subpatternappend((ASSERT, (dir, p))) subpatternappend((ASSERT, (dir, p)))
else: else:
...@@ -680,23 +694,26 @@ def _parse(source, state): ...@@ -680,23 +694,26 @@ def _parse(source, state):
condname = source.getuntil(")") condname = source.getuntil(")")
group = 2 group = 2
if not condname: if not condname:
raise error("missing group name") raise source.error("missing group name", 1)
if condname.isidentifier(): if condname.isidentifier():
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
msg = "unknown group name: {0!r}".format(condname) msg = "unknown group name: {0!r}".format(condname)
raise error(msg) raise source.error(msg, len(condname) + 1)
else: else:
try: try:
condgroup = int(condname) condgroup = int(condname)
if condgroup < 0: if condgroup < 0:
raise ValueError raise ValueError
except ValueError: except ValueError:
raise error("bad character in group name") raise source.error("bad character in group name",
len(condname) + 1)
if not condgroup: if not condgroup:
raise error("bad group number") raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS: if condgroup >= MAXGROUPS:
raise error("the group number is too large") raise source.error("the group number is too large",
len(condname) + 1)
elif char in FLAGS: elif char in FLAGS:
# flags # flags
state.flags |= FLAGS[char] state.flags |= FLAGS[char]
...@@ -704,20 +721,23 @@ def _parse(source, state): ...@@ -704,20 +721,23 @@ def _parse(source, state):
state.flags |= FLAGS[sourceget()] state.flags |= FLAGS[sourceget()]
verbose = state.flags & SRE_FLAG_VERBOSE verbose = state.flags & SRE_FLAG_VERBOSE
else: else:
raise error("unexpected end of pattern " + char) raise source.error("unexpected end of pattern")
if group: if group:
# parse group contents # parse group contents
if group == 2: if group == 2:
# anonymous group # anonymous group
group = None group = None
else: else:
try:
group = state.opengroup(name) group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1)
if condgroup: if condgroup:
p = _parse_sub_cond(source, state, condgroup) p = _parse_sub_cond(source, state, condgroup)
else: else:
p = _parse_sub(source, state) p = _parse_sub(source, state)
if not sourcematch(")"): if not sourcematch(")"):
raise error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if group is not None: if group is not None:
state.closegroup(group, p) state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p))) subpatternappend((SUBPATTERN, (group, p)))
...@@ -725,10 +745,10 @@ def _parse(source, state): ...@@ -725,10 +745,10 @@ def _parse(source, state):
while True: while True:
char = sourceget() char = sourceget()
if char is None: if char is None:
raise error("unexpected end of pattern") raise source.error("unexpected end of pattern")
if char == ")": if char == ")":
break break
raise error("unknown extension") raise source.error("unknown extension", len(char))
elif this == "^": elif this == "^":
subpatternappend((AT, AT_BEGINNING)) subpatternappend((AT, AT_BEGINNING))
...@@ -737,7 +757,7 @@ def _parse(source, state): ...@@ -737,7 +757,7 @@ def _parse(source, state):
subpattern.append((AT, AT_END)) subpattern.append((AT, AT_END))
else: else:
raise error("parser error") raise source.error("parser error", len(this))
return subpattern return subpattern
...@@ -768,9 +788,10 @@ def parse(str, flags=0, pattern=None): ...@@ -768,9 +788,10 @@ def parse(str, flags=0, pattern=None):
if source.next is not None: if source.next is not None:
if source.next == ")": if source.next == ")":
raise error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
else: else:
raise error("bogus characters at end of regular expression") raise source.error("bogus characters at end of regular expression",
len(tail))
if flags & SRE_FLAG_DEBUG: if flags & SRE_FLAG_DEBUG:
p.dump() p.dump()
...@@ -809,16 +830,18 @@ def parse_template(source, pattern): ...@@ -809,16 +830,18 @@ def parse_template(source, pattern):
if s.match("<"): if s.match("<"):
name = s.getuntil(">") name = s.getuntil(">")
if not name: if not name:
raise error("missing group name") raise s.error("missing group name", 1)
try: try:
index = int(name) index = int(name)
if index < 0: if index < 0:
raise error("negative group number") raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS: if index >= MAXGROUPS:
raise error("the group number is too large") raise s.error("the group number is too large",
len(name) + 1)
except ValueError: except ValueError:
if not name.isidentifier(): if not name.isidentifier():
raise error("bad character in group name") raise s.error("bad character in group name",
len(name) + 1)
try: try:
index = pattern.groupindex[name] index = pattern.groupindex[name]
except KeyError: except KeyError:
...@@ -841,8 +864,8 @@ def parse_template(source, pattern): ...@@ -841,8 +864,8 @@ def parse_template(source, pattern):
isoctal = True isoctal = True
c = int(this[1:], 8) c = int(this[1:], 8)
if c > 0o377: if c > 0o377:
raise error('octal escape value %r outside of ' raise s.error('octal escape value %r outside of '
'range 0-0o377' % this) 'range 0-0o377' % this, len(this))
lappend(chr(c)) lappend(chr(c))
if not isoctal: if not isoctal:
addgroup(int(this[1:])) addgroup(int(this[1:]))
......
...@@ -1419,6 +1419,42 @@ SUBPATTERN None ...@@ -1419,6 +1419,42 @@ SUBPATTERN None
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')
err = cm.exception
self.assertIsInstance(err.pattern, str)
self.assertEqual(err.pattern, '(\u20ac))')
self.assertEqual(err.pos, 3)
self.assertEqual(err.lineno, 1)
self.assertEqual(err.colno, 4)
self.assertIn(err.msg, str(err))
self.assertIn(' at position 3', str(err))
self.assertNotIn(' at position 3', err.msg)
# Bytes pattern
with self.assertRaises(re.error) as cm:
re.compile(b'(\xa4))')
err = cm.exception
self.assertIsInstance(err.pattern, bytes)
self.assertEqual(err.pattern, b'(\xa4))')
self.assertEqual(err.pos, 3)
# Multiline pattern
with self.assertRaises(re.error) as cm:
re.compile("""
(
abc
)
)
(
""", re.VERBOSE)
err = cm.exception
self.assertEqual(err.pos, 77)
self.assertEqual(err.lineno, 5)
self.assertEqual(err.colno, 17)
self.assertIn(err.msg, str(err))
self.assertIn(' at position 77', str(err))
self.assertIn('(line 5, column 17)', str(err))
class PatternReprTests(unittest.TestCase): class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected): def check(self, pattern, expected):
......
...@@ -183,6 +183,8 @@ Core and Builtins ...@@ -183,6 +183,8 @@ Core and Builtins
Library Library
------- -------
- Issue #22578: Added attributes to the re.error class.
- Issue #12728: Different Unicode characters having the same uppercase but - Issue #12728: Different Unicode characters having the same uppercase but
different lowercase are now matched in case-insensitive regular expressions. different lowercase are now matched in case-insensitive regular expressions.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment