Commit 5bb282d5 authored by Serhiy Storchaka

Issue #19365: Optimized the parsing of long replacement strings in re.sub*() functions.
parent 97bb27d5
...@@ -769,35 +769,33 @@ def parse_template(source, pattern): ...@@ -769,35 +769,33 @@ def parse_template(source, pattern):
# group references # group references
s = Tokenizer(source) s = Tokenizer(source)
sget = s.get sget = s.get
p = [] groups = []
a = p.append literals = []
def literal(literal, p=p, pappend=a): literal = []
if p and p[-1][0] is LITERAL: lappend = literal.append
p[-1] = LITERAL, p[-1][1] + literal def addgroup(index):
else: if literal:
pappend((LITERAL, literal)) literals.append(''.join(literal))
sep = source[:0] del literal[:]
if isinstance(sep, str): groups.append((len(literals), index))
makechar = chr literals.append(None)
else: while True:
makechar = chr
while 1:
this = sget() this = sget()
if this is None: if this is None:
break # end of replacement string break # end of replacement string
if this and this[0] == "\\": if this[0] == "\\":
# group # group
c = this[1:2] c = this[1]
if c == "g": if c == "g":
name = "" name = ""
if s.match("<"): if s.match("<"):
while 1: while True:
char = sget() char = sget()
if char is None: if char is None:
raise error("unterminated group name") raise error("unterminated group name")
if char == ">": if char == ">":
break break
name = name + char name += char
if not name: if not name:
raise error("missing group name") raise error("missing group name")
try: try:
...@@ -811,50 +809,38 @@ def parse_template(source, pattern): ...@@ -811,50 +809,38 @@ def parse_template(source, pattern):
index = pattern.groupindex[name] index = pattern.groupindex[name]
except KeyError: except KeyError:
raise IndexError("unknown group name") raise IndexError("unknown group name")
a((MARK, index)) addgroup(index)
elif c == "0": elif c == "0":
if s.next in OCTDIGITS: if s.next in OCTDIGITS:
this = this + sget() this += sget()
if s.next in OCTDIGITS: if s.next in OCTDIGITS:
this = this + sget() this += sget()
literal(makechar(int(this[1:], 8) & 0xff)) lappend(chr(int(this[1:], 8) & 0xff))
elif c in DIGITS: elif c in DIGITS:
isoctal = False isoctal = False
if s.next in DIGITS: if s.next in DIGITS:
this = this + sget() this += sget()
if (c in OCTDIGITS and this[2] in OCTDIGITS and if (c in OCTDIGITS and this[2] in OCTDIGITS and
s.next in OCTDIGITS): s.next in OCTDIGITS):
this = this + sget() this += sget()
isoctal = True isoctal = True
literal(makechar(int(this[1:], 8) & 0xff)) lappend(chr(int(this[1:], 8) & 0xff))
if not isoctal: if not isoctal:
a((MARK, int(this[1:]))) addgroup(int(this[1:]))
else: else:
try: try:
this = makechar(ESCAPES[this][1]) this = chr(ESCAPES[this][1])
except KeyError: except KeyError:
pass pass
literal(this) lappend(this)
else: else:
literal(this) lappend(this)
# convert template to groups and literals lists if literal:
i = 0 literals.append(''.join(literal))
groups = [] if not isinstance(source, str):
groupsappend = groups.append
literals = [None] * len(p)
if isinstance(source, str):
encode = lambda x: x
else:
# The tokenizer implicitly decodes bytes objects as latin-1, we must # The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation. # therefore re-encode the final representation.
encode = lambda x: x.encode('latin-1') literals = [None if s is None else s.encode('latin-1') for s in literals]
for c, s in p:
if c is MARK:
groupsappend((i, s))
# literal[i] is already None
else:
literals[i] = encode(s)
i = i + 1
return groups, literals return groups, literals
def expand_template(template, match): def expand_template(template, match):
......
...@@ -19,6 +19,9 @@ Core and Builtins ...@@ -19,6 +19,9 @@ Core and Builtins
Library Library
------- -------
- Issue #19365: Optimized the parsing of long replacement strings in re.sub*()
functions.
- Issue #19352: Fix unittest discovery when a module can be reached - Issue #19352: Fix unittest discovery when a module can be reached
through several paths (e.g. under Debian/Ubuntu with virtualenv). through several paths (e.g. under Debian/Ubuntu with virtualenv).
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment