Commit 86c128f2 authored by Vincent Pelletier

New timing log analyser.

#!/usr/bin/env python
##############################################################################
#
# Copyright (c) 2013 Nexedi SA and Contributors. All Rights Reserved.
# Vincent Pelletier <vincent@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company.
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
# TODO:
# - use some templating system instead of hardcoded html strings
# - implement an alternative to matplotlib (inline SVG?) to get nice output
#   under PyPy
# - provide some form of raw data output, not just html
from cgi import escape
from collections import defaultdict
from functools import partial
from operator import itemgetter
from urllib import splittype, splithost
import argparse
import gzip
import os
import re
import sys
PARSE_DATE = False
try:
import matplotlib
except ImportError:
matplotlib = None
else:
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot
from matplotlib.dates import DateFormatter, DayLocator
day_formatter = DateFormatter('%Y-%m-%d')
all_days = DayLocator()
PARSE_DATE = True
# When enabled: nicer dates, massively slower
if PARSE_DATE:
from datetime import datetime, tzinfo, timedelta
import locale
locale.setlocale(locale.LC_TIME, 'C')
class TZ(tzinfo):
def __init__(self, tz):
self.name = tz
tz = int(tz)
      # Split sign from magnitude so negative fractional offsets such as
      # -0530 compute correctly under floor division.
      sign = -1 if tz < 0 else 1
      self.offset = timedelta(0, (abs(tz) // 100 * 60 + abs(tz) % 100) * 60) * sign
super(TZ, self).__init__()
def dst(self, dt):
return timedelta(0)
def fromutc(self, dt):
# Taken from python's standard library doc, as of python 2.7
dtoff = dt.utcoffset()
dtdst = dt.dst()
delta = dtoff - dtdst
if delta:
dt += delta
dtdst = dt.dst()
if dtdst:
return dt + dtdst
return dt
    def tzname(self, dt):
return self.name
def utcoffset(self, dt):
return self.offset
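  # Behaviour sketch (assuming the offset arithmetic above): a log timezone
  # field such as '+0530' maps to a fixed UTC offset of 5 hours 30 minutes:
  #   TZ('+0530').utcoffset(None) == timedelta(hours=5, minutes=30)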
TZ_CACHE = {}
MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
US_PER_S = 10 ** 6
N_SLOWEST = 20
N_SLOWEST_THRESHOLD = N_SLOWEST * 4
ITEMGETTER0 = itemgetter(0)
APDEX_TOLERATING_COEF = 4
def statusIsError(status):
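  # Status is compared as a string: '4' and '5' sort after '3', so any 4xx
  # or 5xx code counts as an error while 1xx, 2xx and 3xx do not.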
return status[0] > '3'
def getClassForDuration(duration, threshold):
if duration <= threshold:
return ''
if duration <= threshold * APDEX_TOLERATING_COEF:
return 'warning'
return 'problem'
def getClassForStatusHit(hit, status):
if hit and statusIsError(status):
return 'problem'
return ''
class APDEXStats(object):
def __init__(self, threshold):
self.threshold = threshold
self.threshold4 = threshold * APDEX_TOLERATING_COEF
self.apdex_1 = 0
self.apdex_4 = 0
self.hit = 0
self.duration_total = 0
self.duration_max = 0
def accumulate(self, match):
duration = int(match.group('duration'))
self.duration_total += duration
self.duration_max = max(self.duration_max, duration)
if not statusIsError(match.group('status')):
if duration <= self.threshold:
self.apdex_1 += 1
elif duration <= self.threshold4:
self.apdex_4 += 1
self.hit += 1
def accumulateFrom(self, other):
for attribute in ('apdex_1', 'apdex_4', 'hit', 'duration_total'):
setattr(self, attribute,
getattr(self, attribute) + getattr(other, attribute))
self.duration_max = max(self.duration_max, other.duration_max)
def getApdex(self):
if self.hit:
return (self.apdex_1 + self.apdex_4 * .5) / self.hit
return 1
def getAverage(self):
if self.hit:
return float(self.duration_total) / self.hit
return 0
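# Worked example of the Apdex computation above: with a 1s threshold, 7
# satisfied hits (<= 1s), 2 tolerated hits (<= 4s) and 1 frustrated hit yield
# (7 + 2 * .5) / 10 = 0.8, i.e. an Apdex of 80%.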
class GenericSiteStats(object):
def __init__(self, threshold, prefix=1):
self.threshold = threshold
self.prefix = prefix
self.status = defaultdict(partial(defaultdict, int))
self.slowest_list = [(-1, None, None, None)]
self.daily_apdex = defaultdict(partial(APDEXStats, threshold))
def accumulate(self, match, url_match, utcdate):
self.status[match.group('status')][utcdate] += 1
duration = match.group('duration')
self.daily_apdex[utcdate].accumulate(match)
if duration is not None:
duration = int(duration)
if url_match is None:
url = match.group('request')
else:
url = url_match.group('url')
if duration > self.slowest_list[0][0]:
slowest_list = self.slowest_list
slowest_list.append((duration, match.group('timestamp'), url,
match.group('referer')))
if len(slowest_list) > N_SLOWEST_THRESHOLD:
self._housekeeping()
def _housekeeping(self):
slowest_list = self.slowest_list
slowest_list.sort(key=ITEMGETTER0)
slowest_list[:] = slowest_list[-N_SLOWEST:]
def getApdexData(self):
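    # Returns a list of (date, apdex %, average duration (s), max duration
    # (s), hit count) tuples, sorted by date.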
apdex = APDEXStats(self.threshold)
for data in self.daily_apdex.itervalues():
apdex.accumulateFrom(data)
return [
(date, apdex.getApdex() * 100, apdex.getAverage() / US_PER_S,
float(apdex.duration_max) / US_PER_S, apdex.hit) for date, apdex
in sorted(self.daily_apdex.iteritems(), key=ITEMGETTER0)]
def asHTML(self):
self._housekeeping()
result = []
append = result.append
apdex = APDEXStats(self.threshold)
for data in self.daily_apdex.itervalues():
apdex.accumulateFrom(data)
append('<h2>Overall</h2><p>Hits: %i</p><p>Apdex: %i%%</p>'
'<p>Avg duration (s): %.2f</p><p>Max duration (s): %.2f</p>' % (
apdex.hit,
apdex.getApdex() * 100,
apdex.getAverage() / US_PER_S,
float(apdex.duration_max) / US_PER_S,
))
column_set = set()
for data_dict in self.status.itervalues():
column_set.update(data_dict)
column_list = sorted(column_set)
append('<h2>Hit per status code</h2><table><tr><th>status</th>'
'<th>overall</th>')
for date in column_list:
append('<th>%s</th>' % date)
append('</tr>')
def hitTd(hit, status):
return '<td class="%s">%s</td>' % (getClassForStatusHit(hit, status), hit)
for status, data_dict in sorted(self.status.iteritems(), key=ITEMGETTER0):
append('<tr><td>%s</td>' % status)
append(hitTd(sum(data_dict.itervalues()), status))
for date in column_list:
append(hitTd(data_dict[date], status))
append('</tr>')
append('</table><h2>Slowest pages</h2><table><tr><th>duration (s)</th>'
'<th>date</th><th>url</th><th>referer</th></tr>')
for duration, timestamp, url, referer in reversed(self.slowest_list):
if timestamp is None:
continue
append('<tr><td class="%s">%.2f</td><td>%s</td><td class="text">%s</td><td class="text">%s</td></tr>' % (
getClassForDuration(duration, self.threshold),
float(duration) / US_PER_S,
escape(timestamp),
escape(url),
escape(referer),
))
append('</table>')
return '\n'.join(result)
class ERP5SiteStats(GenericSiteStats):
"""
Heuristic used:
- ignore any GET parameter
- If the first in-site url chunk ends with "_module", count line as
belonging to a module
- If a line belongs to a module and has at least 2 slashes after module,
count line as belonging to a document of that module
"""
def __init__(self, threshold, prefix=1):
super(ERP5SiteStats, self).__init__(threshold, prefix=prefix)
# Key levels:
# - module id (string)
# - date (datetime.date)
# - is document (bool)
self.module = defaultdict(partial(defaultdict, partial(
defaultdict, partial(APDEXStats, threshold))))
def accumulate(self, match, url_match, utcdate):
prefix = self.prefix
split = url_match.group('url').split('?', 1)[0].split('/')[1 + prefix:]
if split:
module = split[0]
if module.endswith('_module'):
super(ERP5SiteStats, self).accumulate(match, url_match, utcdate)
self.module[module][utcdate][len(split) > 2].accumulate(match)
def asHTML(self):
result = []
append = result.append
append('<h2>Per module</h2><table><tr><th rowspan="2" colspan="2">'
'module</th><th colspan="4">overall</th>')
column_set = set()
for data_dict in self.module.itervalues():
column_set.update(data_dict)
column_list = sorted(column_set)
for date in column_list:
append('<th colspan="4">%s</th>' % date)
append('</tr><tr>')
for _ in xrange(len(column_list) + 1):
append('<th>Apdex</th><th>hits</th><th>avg (s)</th>'
'<th>max (s)</th>')
append('</tr>')
def apdexStatsAsHtml(data):
return '<td>%i%%</td><td>%s</td><td class="%s">%.2f</td><td class="%s">%.2f</td>' % (
data.getApdex() * 100,
data.hit,
getClassForDuration(data.getAverage(), self.threshold),
data.getAverage() / US_PER_S,
getClassForDuration(data.duration_max, self.threshold),
float(data.duration_max) / US_PER_S,
)
def apdexAsColumns(caption, is_document, data_dict):
append('<td>%s</td>' % caption)
data_total = APDEXStats(self.threshold)
      for data in data_dict.itervalues():
data_total.accumulateFrom(data[is_document])
append(apdexStatsAsHtml(data_total))
for date in column_list:
append(apdexStatsAsHtml(data_dict[date][is_document]))
append('</tr>')
for module_id, data_dict in sorted(self.module.iteritems(),
key=ITEMGETTER0):
append('<tr><td rowspan="2">%s</td>' % module_id)
apdexAsColumns('module', False, data_dict)
append('<tr>')
apdexAsColumns('document', True, data_dict)
append('</table>')
append(super(ERP5SiteStats, self).asHTML())
return '\n'.join(result)
logformat_dict = {
'%h': r'(?P<host>[^ ]*)',
'%l': r'(?P<ident>[^ ]*)',
'%u': r'(?P<user>[^ ]*)',
'%t': r'\[(?P<timestamp>[^\]]*)\]',
'%r': r'(?P<request>(\\.|[^\\"])*)', # XXX: expected to be enclosed in ". See also REQUEST_PATTERN
'%>s': r'(?P<status>[0-9]*?)',
'%O': r'(?P<size>[0-9-]*?)',
'%{Referer}i': r'(?P<referer>[^"]*)', # XXX: expected to be enclosed in "
'%{User-Agent}i': r'(?P<agent>[^"]*)', # XXX: expected to be enclosed in "
'%D': r'(?P<duration>[0-9]*)',
'%%': r'%',
}
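# Example line matched by the default --logformat (hypothetical values):
#   127.0.0.1 - - [15/Jan/2013:10:00:00 +0100] "GET /erp5/person_module/42/view HTTP/1.1" 200 1234 "-" "Mozilla/5.0" 123456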
REQUEST_PATTERN = re.compile('(?P<method>[^ ]*) (?P<url>[^ ]*)'
'( (?P<protocol>.*))?')
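# Example: 'GET /erp5/foo HTTP/1.1' yields method='GET', url='/erp5/foo' and
# protocol='HTTP/1.1'; the protocol group is optional, so 'GET /erp5/foo'
# alone also matches.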
class AggregateSiteUrl(argparse.Action):
__argument_to_aggregator = {
'--base': GenericSiteStats,
'--erp5-base': ERP5SiteStats,
'--skip-base': None,
}
def __call__(self, parser, namespace, values, option_string=None):
action = self.__argument_to_aggregator[option_string]
if action is not None:
if values[-1:] == '/':
offset = -1
else:
offset = 0
action = partial(action, prefix=values.count('/') + offset)
getattr(namespace, self.dest).append((values, action))
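# For example, '--erp5-base /erp5' and '--erp5-base /erp5/' both store
# prefix=1, making accumulators slice the url as split('/')[2:]: the leading
# empty chunk and 'erp5' are dropped.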
def main():
parser = argparse.ArgumentParser(description='Compute Apdex out of log files')
parser.add_argument('logfile', nargs='+',
help='Log files to process')
parser.add_argument('-a', '--apdex', default=US_PER_S, type=int,
help='First threshold for Apdex computation, in microseconds. '
'Default: %(default)r')
parser.add_argument('-d', '--default',
help='Caption for lines matching no prefix, or skip them if not provided.')
parser.add_argument('--base', dest='site_list', default=[],
action=AggregateSiteUrl,
help='Absolute base url of some part of a site (see also '
'--erp5-base).')
parser.add_argument('--erp5-base', dest='site_list',
action=AggregateSiteUrl,
help='Absolute base url of some part of an ERP5 site (extra '
'stats).')
parser.add_argument('--skip-base', dest='site_list',
action=AggregateSiteUrl,
help='Absolute base url to ignore.')
parser.add_argument('-l', '--logformat',
default='%h %l %u %t "%r" %>s %O "%{Referer}i" "%{User-Agent}i" %D',
help='Apache LogFormat used to generate provided logs. '
'Default: %(default)r')
parser.add_argument('-o', '--out', default='.',
help='Directory in which statistic files will be generated. '
'Default: %(default)r')
parser.add_argument('-q', '--quiet', action='store_true',
help='Suppress warnings about malformed lines.')
args = parser.parse_args()
line_regex = ''
try:
n = iter(args.logformat).next
while True:
key = None
char = n()
if char == '%':
fmt = n()
key = char + fmt
if fmt == '{':
while fmt != '}':
fmt = n()
key += fmt
key += n()
elif fmt == '>':
key += n()
char = logformat_dict[key]
line_regex += char
except StopIteration:
assert not key, key
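  # For instance, the format '%h "%r" %>s' compiles to
  # r'(?P<host>[^ ]*) "(?P<request>(\\.|[^\\"])*)" (?P<status>[0-9]*?)'.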
matchline = re.compile(line_regex).match
matchrequest = REQUEST_PATTERN.match
site_list = args.site_list
default_site = args.default
if default_site is None:
default_action = None
else:
default_action = partial(GenericSiteStats, prefix=0)
infile_list = args.logfile
quiet = args.quiet
threshold = args.apdex
file_count = len(infile_list)
per_site = {}
hit_per_day = defaultdict(int)
for fileno, filename in enumerate(infile_list, 1):
print >> sys.stderr, 'Processing %s [%i/%i]' % (
filename, fileno, file_count)
logfile = gzip.open(filename)
try:
logfile.readline()
except IOError:
logfile = open(filename)
else:
logfile.seek(0)
for lineno, line in enumerate(logfile, 1):
if lineno % 5000 == 0:
sys.stderr.write('%i\r' % lineno)
sys.stderr.flush()
match = matchline(line)
if match is None:
if not quiet:
print >> sys.stderr, 'Malformed line at %s:%i: %r' % (
filename, lineno, line)
continue
url_match = matchrequest(match.group('request'))
if url_match is None:
continue
url = url_match.group('url')
if url.startswith('http'):
url = splithost(splittype(url)[1])[1]
startswith = url.startswith
for site, action in site_list:
if startswith(site):
break
else:
site = default_site
action = default_action
if action is None:
continue
timestamp = match.group('timestamp')
dt, tz = timestamp.split(' ')
if PARSE_DATE:
try:
tz = TZ_CACHE[tz]
except KeyError:
tz = TZ_CACHE[tz] = TZ(tz)
date, hour, minute, second = dt.split(':')
day, month, year = date.split('/')
dt = datetime(int(year), MONTH_VALUE_DICT[month], int(day),
int(hour), int(minute), int(second), tzinfo=tz)
utcdate = (dt - dt.utcoffset()).date()
else:
day, month, year = dt.split(':', 1)[0].split('/')
utcdate = '%s/%02i/%s' % (year, MONTH_VALUE_DICT[month], day)
hit_per_day[utcdate] += 1
try:
site_data = per_site[site]
except KeyError:
site_data = per_site[site] = action(threshold)
site_data.accumulate(match, url_match, utcdate)
os.chdir(args.out)
with open('index.html', 'w') as out:
out.write('<html><head><title>Stats</title><style>th, td { border: solid 1px #000; } th { text-align: center; } td { text-align: right; } td.text { text-align: left; } table { border-collapse: collapse; } .problem { background-color: #f00; color: white; } .warning { background-color: #f80; color: white; } </style></head><body><h1>Overall</h1><h2>Hits per day</h2><table><tr><th>date</th><th>hits</th></tr>')
for date, hit in sorted(hit_per_day.iteritems(), key=ITEMGETTER0):
out.write('<tr><td>%s</td><td>%s</td></tr>' % (date, hit))
out.write('</table>')
for site_id, data in per_site.iteritems():
out.write('<h1>Site: %s</h1>' % site_id)
if matplotlib is not None:
daily_data = data.getApdexData()
date_list = [x[0] for x in daily_data]
apdex_plot = pyplot.subplot(2, 1, 1)
apdex_plot.xaxis.set_major_locator(all_days)
apdex_plot.xaxis.set_major_formatter(day_formatter)
pyplot.title('Apdex')
pyplot.ylabel('%')
pyplot.plot(date_list, [x[1] for x in daily_data], '-')
pyplot.plot(date_list, [x[1] for x in daily_data], '.')
# response_time_plot = pyplot.subplot(3, 1, 2)
# response_time_plot.xaxis.set_major_locator(all_days)
# response_time_plot.xaxis.set_major_formatter(day_formatter)
# pyplot.title('Response time')
# pyplot.ylabel('time (s)')
# pyplot.plot(date_list, [x[2] for x in daily_data], '-', label='Average')
# pyplot.plot(date_list, [x[3] for x in daily_data], '--', label='Max')
# pyplot.legend()
hit_plot = pyplot.subplot(2, 1, 2)
hit_plot.xaxis.set_major_locator(all_days)
hit_plot.xaxis.set_major_formatter(day_formatter)
pyplot.title('Hits')
pyplot.plot(date_list, [x[4] for x in daily_data], '-')
pyplot.plot(date_list, [x[4] for x in daily_data], '.')
plot_filename = site_id.strip('/').replace('/', '__') + '.png'
pyplot.subplots_adjust(hspace=.4)
pyplot.savefig(plot_filename)
pyplot.clf()
out.write('<img src="' + plot_filename + '" />')
out.write(data.asHTML())
out.write('</body></html>')
if __name__ == '__main__':
main()
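# Example invocation (hypothetical file name for this script):
#   python apachedex.py --erp5-base /erp5 -d other -o report access.log.gz
# writes report/index.html, plus one PNG graph per site when matplotlib is
# available.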