Issue #22578: Added attributes to the re.error class.

ad446d57 · Serhiy Storchaka · eb99e515 · ad446d57 · ad446d57 · ad446d57
Commit ad446d57 authored Nov 10, 2014 by Serhiy Storchaka
5 changed files
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -733,13 +733,36 @@ form.
   Clear the regular expression cache.
-.. exception:: error
+.. exception:: error(msg, pattern=None, pos=None)
   Exception raised when a string passed to one of the functions here is not a
   valid regular expression (for example, it might contain unmatched parentheses)
   or when some other error occurs during compilation or matching.  It is never an
-   error if a string contains no match for a pattern.
+   error if a string contains no match for a pattern.  The error instance has
+   the following additional attributes:
+   .. attribute:: msg
+      The unformatted error message.
+   .. attribute:: pattern
+      The regular expression pattern.
+   .. attribute:: pos
+      The index in *pattern* where compilation failed (may be ``None``).
+   .. attribute:: lineno
+      The line corresponding to *pos* (may be ``None``).
+   .. attribute:: colno
+      The column corresponding to *pos* (may be ``None``).
+   .. versionchanged:: 3.5
+      Added additional attributes.
 .. _re-objects:

--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -21,7 +21,35 @@ from _sre import MAXREPEAT, MAXGROUPS
 # should this really be here?
 class error(Exception):
-    pass
+    """Exception carrying a message plus optional location information.
+
+    Attributes set by ``__init__``:
+        msg: the message exactly as passed in (no position suffix).
+        pattern: the pattern passed in, or None.
+        pos: the index into *pattern* passed in, or None.
+        lineno, colno: 1-based line and column of *pos* in *pattern*;
+            both are None unless *pattern* and *pos* are both given.
+    """
+    def __init__(self, msg, pattern=None, pos=None):
+        self.msg = msg
+        self.pattern = pattern
+        self.pos = pos
+        if pattern is not None and pos is not None:
+            # Only the text handed to Exception is decorated; self.msg
+            # keeps the raw message.
+            msg = '%s at position %d' % (msg, pos)
+            # Search with a newline of the same type as the pattern
+            # (a non-str pattern is assumed to be bytes-like).
+            if isinstance(pattern, str):
+                newline = '\n'
+            else:
+                newline = b'\n'
+            self.lineno = pattern.count(newline, 0, pos) + 1
+            # rfind() returns -1 when no newline precedes pos, which
+            # yields pos + 1 on the first line.
+            self.colno = pos - pattern.rfind(newline, 0, pos)
+            # Line/column are appended only for multi-line patterns.
+            if newline in pattern:
+                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
+        else:
+            self.lineno = self.colno = None
+        super().__init__(msg)
+def linecol(doc, pos):
+    """Return the 1-based ``(lineno, colno)`` of index *pos* in *doc*.
+
+    *doc* may be a str or a bytes-like object; the newline used for
+    counting is chosen to match its type.
+    """
+    # The parameter is ``doc``; the body previously referred to an
+    # undefined name ``pattern`` and raised NameError on every call.
+    if isinstance(doc, str):
+        newline = '\n'
+    else:
+        newline = b'\n'
+    lineno = doc.count(newline, 0, pos) + 1
+    if lineno == 1:
+        # No newline precedes pos, so the column is just the offset + 1.
+        colno = pos + 1
+    else:
+        # A newline is guaranteed to exist before pos here, so rindex()
+        # cannot raise.
+        colno = pos - doc.rindex(newline, 0, pos)
+    return lineno, colno
 class _NamedIntConstant(int):

--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -81,8 +81,8 @@ class Pattern:
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
-                raise error("redefinition of group name %s as group %d; "
+                raise error("redefinition of group name %r as group %d; "
-                            "was group %d" % (repr(name), gid,  ogid))
+                            "was group %d" % (name, gid,  ogid))
            self.groupdict[name] = gid
        return gid
    def closegroup(self, gid, p):
@@ -206,24 +206,25 @@ class SubPattern:
 class Tokenizer:
    def __init__(self, string):
+        """Set up tokenizing of *string* and load the first token.
+
+        ``self.string`` keeps the pattern as given, while
+        ``self.decoded_string`` holds the text that is actually scanned
+        (a non-str pattern is decoded as latin1).
+        """
        self.istext = isinstance(string, str)
+        self.string = string
        if not self.istext:
            string = str(string, 'latin1')
-        self.string = string
+        self.decoded_string = string
        self.index = 0
+        # Prime self.next with the first token.
        self.__next()
    def __next(self):
+        """Read the token at ``self.index`` into ``self.next``.
+
+        A token is one character, or a backslash plus the character
+        after it.  ``self.next`` becomes None at the end of the input.
+        Raises ``error`` when a backslash is the last character.
+        """
        index = self.index
        try:
-            char = self.string[index]
+            char = self.decoded_string[index]
        except IndexError:
+            # Past the end of the pattern: no more tokens.
            self.next = None
            return
        if char == "\\":
+            # Keep the backslash and the escaped character together.
            index += 1
            try:
-                char += self.string[index]
+                char += self.decoded_string[index]
            except IndexError:
-                raise error("bogus escape (end of line)")
+                # "from None" hides the internal IndexError from the
+                # traceback.
+                raise self.error("bogus escape (end of line)") from None
        self.index = index + 1
        self.next = char
    def match(self, char):
@@ -250,15 +251,19 @@ class Tokenizer:
            c = self.next
            self.__next()
            if c is None:
-                raise error("unterminated name")
+                raise self.error("unterminated name")
            if c == terminator:
                break
            result += c
        return result
    def tell(self):
-        return self.index, self.next
+        # Return the index at which the pending token (self.next) starts;
+        # a None token counts as zero length.
+        return self.index - len(self.next or '')
    def seek(self, index):
-        self.index, self.next = index
+        # Move to *index* and re-read the token found there.
+        self.index = index
+        self.__next()
+    def error(self, msg, offset=0):
+        """Build (not raise) an ``error`` for *msg*.
+
+        The exception is given the pattern as originally passed in and
+        the position ``self.tell() - offset``.
+        """
+        return error(msg, self.string, self.tell() - offset)
 # The following three functions are not used in this module anymore, but we keep
 # them here (with DeprecationWarnings) for backwards compatibility.
@@ -322,8 +327,8 @@ def _class_escape(source, escape):
            escape += source.getwhile(2, OCTDIGITS)
            c = int(escape[1:], 8)
            if c > 0o377:
-                raise error('octal escape value %r outside of '
+                raise source.error('octal escape value %r outside of '
-                            'range 0-0o377' % escape)
+                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, c
        elif c in DIGITS:
            raise ValueError
@@ -331,7 +336,7 @@ def _class_escape(source, escape):
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
-    raise error("bogus escape: %s" % repr(escape))
+    raise source.error("bogus escape: %r" % escape, len(escape))
 def _escape(source, escape, state):
    # handle escape code in expression
@@ -377,21 +382,23 @@ def _escape(source, escape, state):
                    escape += source.get()
                    c = int(escape[1:], 8)
                    if c > 0o377:
-                        raise error('octal escape value %r outside of '
+                        raise source.error('octal escape value %r outside of '
-                                    'range 0-0o377' % escape)
+                                           'range 0-0o377' % escape,
+                                           len(escape))
                    return LITERAL, c
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
-                    raise error("cannot refer to open group")
+                    raise source.error("cannot refer to open group",
+                                       len(escape))
                return GROUPREF, group
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
-    raise error("bogus escape: %s" % repr(escape))
+    raise source.error("bogus escape: %r" % escape, len(escape))
 def _parse_sub(source, state, nested=True):
    # parse an alternation: a|b|c
@@ -404,7 +411,7 @@ def _parse_sub(source, state, nested=True):
        if not sourcematch("|"):
            break
    if nested and source.next is not None and source.next != ")":
-        raise error("pattern not properly closed")
+        raise source.error("pattern not properly closed")
    if len(items) == 1:
        return items[0]
@@ -449,11 +456,11 @@ def _parse_sub_cond(source, state, condgroup):
    if source.match("|"):
        item_no = _parse(source, state)
        if source.next == "|":
-            raise error("conditional backref with more than two branches")
+            raise source.error("conditional backref with more than two branches")
    else:
        item_no = None
    if source.next is not None and source.next != ")":
-        raise error("pattern not properly closed")
+        raise source.error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
@@ -510,7 +517,7 @@ def _parse(source, state):
            while True:
                this = sourceget()
                if this is None:
-                    raise error("unexpected end of regular expression")
+                    raise source.error("unexpected end of regular expression")
                if this == "]" and set != start:
                    break
                elif this[0] == "\\":
@@ -521,7 +528,7 @@ def _parse(source, state):
                    # potential range
                    this = sourceget()
                    if this is None:
-                        raise error("unexpected end of regular expression")
+                        raise source.error("unexpected end of regular expression")
                    if this == "]":
                        if code1[0] is IN:
                            code1 = code1[1][0]
@@ -533,11 +540,11 @@ def _parse(source, state):
                    else:
                        code2 = LITERAL, _ord(this)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
-                        raise error("bad character range")
+                        raise source.error("bad character range", len(this))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
-                        raise error("bad character range")
+                        raise source.error("bad character range", len(this))
                    setappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
@@ -555,6 +562,7 @@ def _parse(source, state):
        elif this in REPEAT_CHARS:
            # repeat previous item
+            here = source.tell()
            if this == "?":
                min, max = 0, 1
            elif this == "*":
@@ -566,7 +574,6 @@ def _parse(source, state):
                if source.next == "}":
                    subpatternappend((LITERAL, _ord(this)))
                    continue
-                here = source.tell()
                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
@@ -589,18 +596,21 @@ def _parse(source, state):
                    if max >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max < min:
-                        raise error("bad repeat interval")
+                        raise source.error("bad repeat interval",
+                                           source.tell() - here)
            else:
-                raise error("not supported")
+                raise source.error("not supported", len(this))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
-                raise error("nothing to repeat")
+                raise source.error("nothing to repeat",
+                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
-                raise error("multiple repeat")
+                raise source.error("multiple repeat",
+                                   source.tell() - here + len(this))
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            else:
@@ -618,7 +628,7 @@ def _parse(source, state):
                # options
                char = sourceget()
                if char is None:
-                    raise error("unexpected end of pattern")
+                    raise self.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
@@ -626,28 +636,32 @@ def _parse(source, state):
                        name = source.getuntil(">")
                        group = 1
                        if not name:
-                            raise error("missing group name")
+                            raise source.error("missing group name", 1)
                        if not name.isidentifier():
-                            raise error("bad character in group name %r" % name)
+                            raise source.error("bad character in group name "
+                                               "%r" % name,
+                                               len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")")
                        if not name:
-                            raise error("missing group name")
+                            raise source.error("missing group name", 1)
                        if not name.isidentifier():
-                            raise error("bad character in backref group name "
+                            raise source.error("bad character in backref "
-                                        "%r" % name)
+                                               "group name %r" % name,
+                                               len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name: {0!r}".format(name)
-                            raise error(msg)
+                            raise source.error(msg, len(name) + 1)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
-                            raise error("unexpected end of pattern")
+                            raise source.error("unexpected end of pattern")
-                        raise error("unknown specifier: ?P%s" % char)
+                        raise source.error("unknown specifier: ?P%s" % char,
+                                           len(char))
                elif char == ":":
                    # non-capturing group
                    group = 2
@@ -655,7 +669,7 @@ def _parse(source, state):
                    # comment
                    while True:
                        if source.next is None:
-                            raise error("unbalanced parenthesis")
+                            raise source.error("unbalanced parenthesis")
                        if sourceget() == ")":
                            break
                    continue
@@ -665,11 +679,11 @@ def _parse(source, state):
                    if char == "<":
                        char = sourceget()
                        if char is None or char not in "=!":
-                            raise error("syntax error")
+                            raise source.error("syntax error")
                        dir = -1 # lookbehind
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
-                        raise error("unbalanced parenthesis")
+                        raise source.error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
@@ -680,23 +694,26 @@ def _parse(source, state):
                    condname = source.getuntil(")")
                    group = 2
                    if not condname:
-                        raise error("missing group name")
+                        raise source.error("missing group name", 1)
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name: {0!r}".format(condname)
-                            raise error(msg)
+                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
-                            raise error("bad character in group name")
+                            raise source.error("bad character in group name",
+                                               len(condname) + 1)
                        if not condgroup:
-                            raise error("bad group number")
+                            raise source.error("bad group number",
+                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
-                            raise error("the group number is too large")
+                            raise source.error("the group number is too large",
+                                               len(condname) + 1)
                elif char in FLAGS:
                    # flags
                    state.flags |= FLAGS[char]
@@ -704,20 +721,23 @@ def _parse(source, state):
                        state.flags |= FLAGS[sourceget()]
                    verbose = state.flags & SRE_FLAG_VERBOSE
                else:
-                    raise error("unexpected end of pattern " + char)
+                    raise source.error("unexpected end of pattern")
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
+                    try:
                        group = state.opengroup(name)
+                    except error as err:
+                        raise source.error(err.msg, len(name) + 1)
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
-                    raise error("unbalanced parenthesis")
+                    raise source.error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group, p)
                subpatternappend((SUBPATTERN, (group, p)))
@@ -725,10 +745,10 @@ def _parse(source, state):
                while True:
                    char = sourceget()
                    if char is None:
-                        raise error("unexpected end of pattern")
+                        raise source.error("unexpected end of pattern")
                    if char == ")":
                        break
-                    raise error("unknown extension")
+                    raise source.error("unknown extension", len(char))
        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))
@@ -737,7 +757,7 @@ def _parse(source, state):
            subpattern.append((AT, AT_END))
        else:
-            raise error("parser error")
+            raise source.error("parser error", len(this))
    return subpattern
@@ -768,9 +788,10 @@ def parse(str, flags=0, pattern=None):
    if source.next is not None:
        if source.next == ")":
-            raise error("unbalanced parenthesis")
+            raise source.error("unbalanced parenthesis")
        else:
-            raise error("bogus characters at end of regular expression")
+            raise source.error("bogus characters at end of regular expression",
+                               len(tail))
    if flags & SRE_FLAG_DEBUG:
        p.dump()
@@ -809,16 +830,18 @@ def parse_template(source, pattern):
                if s.match("<"):
                    name = s.getuntil(">")
                if not name:
-                    raise error("missing group name")
+                    raise s.error("missing group name", 1)
                try:
                    index = int(name)
                    if index < 0:
-                        raise error("negative group number")
+                        raise s.error("negative group number", len(name) + 1)
                    if index >= MAXGROUPS:
-                        raise error("the group number is too large")
+                        raise s.error("the group number is too large",
+                                      len(name) + 1)
                except ValueError:
                    if not name.isidentifier():
-                        raise error("bad character in group name")
+                        raise s.error("bad character in group name",
+                                      len(name) + 1)
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
@@ -841,8 +864,8 @@ def parse_template(source, pattern):
                        isoctal = True
                        c = int(this[1:], 8)
                        if c > 0o377:
-                            raise error('octal escape value %r outside of '
+                            raise s.error('octal escape value %r outside of '
-                                        'range 0-0o377' % this)
+                                          'range 0-0o377' % this, len(this))
                        lappend(chr(c))
                if not isoctal:
                    addgroup(int(this[1:]))

--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1419,6 +1419,42 @@ SUBPATTERN None
        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
+    def test_error(self):
+        """Check the attributes and message text of re.error."""
+        # Str pattern: the second ')' (index 3) is the offending one.
+        with self.assertRaises(re.error) as cm:
+            re.compile('(\u20ac))')
+        err = cm.exception
+        self.assertIsInstance(err.pattern, str)
+        self.assertEqual(err.pattern, '(\u20ac))')
+        self.assertEqual(err.pos, 3)
+        self.assertEqual(err.lineno, 1)
+        self.assertEqual(err.colno, 4)
+        # msg is the bare message; only str(err) carries the position.
+        self.assertIn(err.msg, str(err))
+        self.assertIn(' at position 3', str(err))
+        self.assertNotIn(' at position 3', err.msg)
+        # Bytes pattern
+        with self.assertRaises(re.error) as cm:
+            re.compile(b'(\xa4))')
+        err = cm.exception
+        self.assertIsInstance(err.pattern, bytes)
+        self.assertEqual(err.pattern, b'(\xa4))')
+        self.assertEqual(err.pos, 3)
+        # Multiline pattern
+        with self.assertRaises(re.error) as cm:
+            re.compile("""
+                (
+                    abc
+                )
+                )
+                (
+                """, re.VERBOSE)
+        err = cm.exception
+        self.assertEqual(err.pos, 77)
+        self.assertEqual(err.lineno, 5)
+        self.assertEqual(err.colno, 17)
+        # A multi-line pattern also gets the "(line, column)" suffix.
+        self.assertIn(err.msg, str(err))
+        self.assertIn(' at position 77', str(err))
+        self.assertIn('(line 5, column 17)', str(err))
 class PatternReprTests(unittest.TestCase):
    def check(self, pattern, expected):

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -183,6 +183,8 @@ Core and Builtins
 Library
 -------
+- Issue #22578: Added attributes to the re.error class.
 - Issue #12728: Different Unicode characters having the same uppercase but
  different lowercase are now matched in case-insensitive regular expressions.