Commit f22f9e03 authored by Vincent Pelletier

Relax regexes for quoted fields.

Allows catching more log lines, especially for quoted fields which lack
quote escaping. This is at the expense of some parsing performance (10%
on a random real-world sample).
Also simplify the code a bit by removing expensive matching logic.
parent 1684838f
...@@ -932,12 +932,12 @@ logformat_dict = { ...@@ -932,12 +932,12 @@ logformat_dict = {
'%l': r'(?P<ident>[^ ]*)', '%l': r'(?P<ident>[^ ]*)',
'%u': r'(?P<user>[^ ]*)', '%u': r'(?P<user>[^ ]*)',
'%t': r'\[(?P<timestamp>\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]', '%t': r'\[(?P<timestamp>\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]',
'%r': r'(?P<request>[^"]*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN '%r': r'(?P<request>.*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%>s': r'(?P<status>[0-9]*?)', '%>s': r'(?P<status>[0-9]*?)',
'%O': r'(?P<size>[0-9-]*?)', '%O': r'(?P<size>[0-9-]*?)',
'%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in " '%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
'%{REMOTE_USER}i': r'(?P<remote_user>[^"]*)', # XXX: expected to be enclosed in " '%{REMOTE_USER}i': r'(?P<remote_user>.*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in " '%{User-Agent}i': r'(?P<agent>.*)', # XXX: expected to be enclosed in "
DURATION_US_FORMAT: r'(?P<duration>[0-9]*)', DURATION_US_FORMAT: r'(?P<duration>[0-9]*)',
DURATION_MS_FORMAT: r'(?P<duration_ms>[0-9]*)', DURATION_MS_FORMAT: r'(?P<duration_ms>[0-9]*)',
DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)', DURATION_S_FORMAT: r'(?P<duration_s>[0-9]*)',
...@@ -947,14 +947,6 @@ logformat_dict = { ...@@ -947,14 +947,6 @@ logformat_dict = {
# TODO: add more formats # TODO: add more formats
} }
# Expensive, but more robust, variants
expensive_logformat_dict = {
'%r': r'(?P<request>(\\.|[^\\"])*)',
'%{Referer}i': r'(?P<referer>(\\.|[^\\"])*)',
'%{User-Agent}i': r'(?P<agent>(\\.|[^\\"])*)',
'%{REMOTE_USER}i': r'(?P<remote_user>(\\.|[^\\"])*)',
}
REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)' REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
'( (?P<protocol>.*))?') '( (?P<protocol>.*))?')
...@@ -1522,12 +1514,11 @@ def main(): ...@@ -1522,12 +1514,11 @@ def main():
get_url_prefix = server_name_group_dict.get(args.match_servername, get_url_prefix = server_name_group_dict.get(args.match_servername,
lambda _, path: path) lambda _, path: path)
line_regex = '' line_regex = ''
expensive_line_regex = ''
try: try:
n = iter(args.logformat).__next__ n = iter(args.logformat).__next__
while True: while True:
key = None key = None
expensive_char = char = n() char = n()
if char == '%': if char == '%':
fmt = n() fmt = n()
key = char + fmt key = char + fmt
...@@ -1541,13 +1532,10 @@ def main(): ...@@ -1541,13 +1532,10 @@ def main():
# XXX: Consider unknown fields have no whitespaces (ie, support for # XXX: Consider unknown fields have no whitespaces (ie, support for
# quotes) # quotes)
char = logformat_dict.get(key, r'\S*') char = logformat_dict.get(key, r'\S*')
expensive_char = expensive_logformat_dict.get(key, char)
line_regex += char line_regex += char
expensive_line_regex += expensive_char
except StopIteration: except StopIteration:
assert not key, key assert not key, key
matchline = re.compile(line_regex).match matchline = re.compile(line_regex).match
expensive_matchline = re.compile(expensive_line_regex).match
matchrequest = REQUEST_PATTERN.match matchrequest = REQUEST_PATTERN.match
if args.period is None: if args.period is None:
next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
...@@ -1680,13 +1668,11 @@ def main(): ...@@ -1680,13 +1668,11 @@ def main():
print(lineno, end='\r', file=sys.stderr) print(lineno, end='\r', file=sys.stderr)
match = matchline(line) match = matchline(line)
if match is None: if match is None:
match = expensive_matchline(line) if not quiet:
if match is None: print(f'Malformed line at {filename}:{lineno}: {line}',
if not quiet: file=sys.stderr)
print(f'Malformed line at {filename}:{lineno}: {line}', malformed_lines += 1
file=sys.stderr) continue
malformed_lines += 1
continue
agent = match.group('agent') agent = match.group('agent')
if any(x(agent) for x in skip_user_agent): if any(x(agent) for x in skip_user_agent):
skipped_user_agent += 1 skipped_user_agent += 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment