Commit f22f9e03 authored by Vincent Pelletier

Relax regexes for quoted fields.

Allows catching more log lines, especially for quoted fields which lack
quote escaping. This is at the expense of some parsing performance (10%
on a random real-world sample).
Also simplify the code a bit by removing expensive matching logic.
parent 1684838f
......@@ -932,12 +932,12 @@ logformat_dict = {
'%l': r'(?P<ident>[^ ]*)',
'%u': r'(?P<user>[^ ]*)',
'%t': r'\[(?P<timestamp>\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]',
'%r': r'(?P<request>[^"]*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%r': r'(?P<request>.*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%>s': r'(?P<status>[0-9]*?)',
'%O': r'(?P<size>[0-9-]*?)',
'%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
'%{REMOTE_USER}i': r'(?P<remote_user>[^"]*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
'%{REMOTE_USER}i': r'(?P<remote_user>.*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>.*)', # XXX: expected to be enclosed in "
DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
DURATION_MS_FORMAT: r'(?P<duration_ms>[0-9]*)',
DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
......@@ -947,14 +947,6 @@ logformat_dict = {
# TODO: add more formats
}
# Expensive, but more robust, variants of the quoted-field patterns.
# Keys are the same log-format directives used as keys in logformat_dict.
# Each value wraps `(\\.|[^\\"])*` in a named group: a run of items that are
# either a backslash followed by one more character, or a single character
# that is neither a backslash nor a double quote. A backslash-escaped double
# quote therefore does not terminate the field, while a bare one does.
expensive_logformat_dict = {
'%r': r'(?P<request>(\\.|[^\\"])*)',
'%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
'%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
'%{REMOTE_USER}i': r'(?P<remote_user>(\\.|[^\\"])*)',
}
# Splits a request string into named groups: `method` and `url` are runs of
# non-space characters separated by a single space; `protocol` is optional
# and, when present, captures everything after the space that follows `url`.
REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
'( (?P<protocol>.*))?')
......@@ -1522,12 +1514,11 @@ def main():
get_url_prefix = server_name_group_dict.get(args.match_servername,
lambda _, path: path)
line_regex = ''
expensive_line_regex = ''
try:
n = iter(args.logformat).__next__
while True:
key = None
expensive_char = char = n()
char = n()
if char == '%':
fmt = n()
key = char + fmt
......@@ -1541,13 +1532,10 @@ def main():
# XXX: Consider unknown fields have no whitespaces (ie, support for
# quotes)
char = logformat_dict.get(key, r'\S*')
expensive_char = expensive_logformat_dict.get(key, char)
line_regex += char
expensive_line_regex += expensive_char
except StopIteration:
assert not key, key
matchline = re.compile(line_regex).match
expensive_matchline = re.compile(expensive_line_regex).match
matchrequest = REQUEST_PATTERN.match
if args.period is None:
next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
......@@ -1680,13 +1668,11 @@ def main():
print(lineno, end='\r', file=sys.stderr)
match = matchline(line)
if match is None:
match = expensive_matchline(line)
if match is None:
if not quiet:
print(f'Malformed line at {filename}:{lineno}: {line}',
file=sys.stderr)
malformed_lines += 1
continue
if not quiet:
print(f'Malformed line at {filename}:{lineno}: {line}',
file=sys.stderr)
malformed_lines += 1
continue
agent = match.group('agent')
if any(x(agent) for x in skip_user_agent):
skipped_user_agent += 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment