Commit 039da94e authored by Vincent Pelletier's avatar Vincent Pelletier

Add optional median computation.

For every measure, display the median in addition to the existing values
(score, average, max).
Optional, because it requires an amount of RAM proportional to the number
of hits.
parent 6a4d6f5c
......@@ -91,6 +91,12 @@ FILE_OPENER_LIST = [
INPUT_ENCODING = 'ascii'
INPUT_ENCODING_ERROR_HANDLER = 'replace'
class _NullList(list):
@staticmethod
def append(_):
pass
NULL_LIST = _NullList()
# Map a three-letter English month abbreviation to its 1-based month number
# (Jan=1 ... Dec=12), for parsing log timestamps.
MONTH_VALUE_DICT = {
  month: number
  for number, month in enumerate(
    ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
    start=1,
  )
}
......@@ -260,9 +266,11 @@ class APDEXStats:
'duration_total',
'duration_max',
'getDuration',
'duration_list',
'enable_median',
)
def __init__(self, threshold, getDuration):
def __init__(self, threshold, getDuration, enable_median):
threshold *= US_PER_S
self.threshold = threshold
self.threshold4 = threshold * APDEX_TOLERATING_COEF
......@@ -272,6 +280,12 @@ class APDEXStats:
self.duration_total = 0
self.duration_max = 0
self.getDuration = getDuration
self.enable_median = enable_median
self.duration_list = (
[]
if enable_median else
NULL_LIST
)
def accumulate(self, match):
duration = self.getDuration(match)
......@@ -283,12 +297,15 @@ class APDEXStats:
elif duration <= self.threshold4:
self.apdex_4 += 1
self.hit += 1
self.duration_list.append(duration)
  def accumulateFrom(self, other):
    """Merge another APDEXStats instance into this one.

    Sums the apdex/hit/duration counters, keeps the larger duration_max
    and, when median computation is enabled, concatenates both duration
    lists lazily (no copy) via itertools.chain.
    """
    for attribute in ('apdex_1', 'apdex_4', 'hit', 'duration_total'):
      setattr(self, attribute,
        getattr(self, attribute) + getattr(other, attribute))
    self.duration_max = max(self.duration_max, other.duration_max)
    if self.enable_median:
      # NOTE(review): after this, duration_list is a one-shot iterator: it
      # can be consumed only once, and a subsequent accumulate() call would
      # fail since chain objects have no append(). Presumably aggregate
      # instances are only ever consumed once after merging — confirm
      # against callers.
      self.duration_list = itertools.chain(self.duration_list, other.duration_list)
def getApdex(self):
if self.hit:
......@@ -304,9 +321,16 @@ class APDEXStats:
return float(self.duration_max) / US_PER_S
@staticmethod
def asHTMLHeader(overall=False):
def asHTMLHeader(overall=False, enable_median=False):
return (
'<th>apdex</th><th>hits</th><th>avg (s)</th>'
'<th>apdex</th>'
'<th>hits</th>'
'<th>avg (s)</th>' +
(
'<th>med (s)</th>'
if enable_median else
''
) +
'<th' + (' class="overall_right"' if overall else '') + '>max (s)</th>'
)
......@@ -328,15 +352,35 @@ class APDEXStats:
extra_right_class = 'overall_right'
else:
extra_right_class = ''
if self.enable_median:
duration_list = sorted(self.duration_list)
if duration_list:
duration_list_len = len(duration_list)
half_duration_list_len = duration_list_len >> 1
if duration_list_len & 1:
median = duration_list[half_duration_list_len]
else:
median = (
duration_list[half_duration_list_len - 1] +
duration_list[half_duration_list_len]
) / 2
median /= US_PER_S
else:
median = 0
median_string = f'<td class="{getClassForDuration(median, threshold)} {extra_class}">{median:.2f}</td>'
else:
median_string = ''
return (
f'<td style="{apdex_style}" class="{extra_class} group_left">{round(apdex * 100)}%</td>'
f'<td class="{extra_class}">{hit}</td>'
f'<td class="{getClassForDuration(average, threshold)} {extra_class}">{average:.2f}</td>'
f'<td class="{getClassForDuration(average, threshold)} {extra_class}">{average:.2f}</td>' +
median_string +
f'<td class="{getClassForDuration(maximum, threshold)} {extra_class} group_right {extra_right_class}">{maximum:.2f}</td>'
)
_IGNORE_IN_STATE = (
'getDuration',
'duration_list',
)
@classmethod
......@@ -344,9 +388,10 @@ class APDEXStats:
result = cls(
threshold=0,
getDuration=getDuration,
enable_median=False,
)
for key in self.__slots__:
if key in self._IGNORE_IN_STATE:
for key in cls.__slots__:
if key in cls._IGNORE_IN_STATE:
continue
try:
value = state[key]
......@@ -377,6 +422,7 @@ class GenericSiteStats:
suffix,
error_detail=False,
user_agent_detail=False,
enable_median=False,
# Non-generic parameters
**_
):
......@@ -387,10 +433,11 @@ class GenericSiteStats:
if error_detail:
# status -> url -> referrer -> count
self.error_url_count = defaultdict(partial(defaultdict, Counter))
self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration, enable_median))
self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration, enable_median))
self.user_agent_detail = user_agent_detail
self.user_agent_counter = Counter()
self.enable_median = enable_median
def rescale(self, convert, getDuration):
for status, date_dict in self.status.items():
......@@ -398,7 +445,7 @@ class GenericSiteStats:
for value_date, status_count in date_dict.items():
new_date_dict[convert(value_date)] += status_count
self.status[status] = new_date_dict
new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration))
new_apdex = defaultdict(partial(APDEXStats, self.threshold, getDuration, self.enable_median))
for value_date, data in self.apdex.items():
new_apdex[convert(value_date)].accumulateFrom(data)
self.apdex = new_apdex
......@@ -431,15 +478,15 @@ class GenericSiteStats:
): # pylint: disable=unused-argument
result = []
append = result.append
apdex = APDEXStats(self.threshold, None)
apdex = APDEXStats(self.threshold, None, self.enable_median)
for data in self.apdex.values():
apdex.accumulateFrom(data)
append('<h2>Overall</h2><table class="stats"><tr>')
append(APDEXStats.asHTMLHeader())
append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
append('</tr><tr>')
append(apdex.asHTML(self.threshold))
append('</tr></table><h2>Hottest pages</h2><table class="stats"><tr>')
append(APDEXStats.asHTMLHeader())
append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
append('<th>url</th></tr>')
for url, data in sorted(self.url_apdex.items(), key=lambda x: x[1].getAverage() * x[1].hit,
reverse=True)[:n_hottest_pages]:
......@@ -521,6 +568,8 @@ class GenericSiteStats:
suffix=suffix,
error_detail=error_detail,
user_agent_detail=state.get('user_agent_detail', True),
# json format does not support median, due to how large they can get
enable_median=False,
)
if error_detail:
error_url_count = result.error_url_count
......@@ -586,6 +635,7 @@ class ERP5SiteStats(GenericSiteStats):
suffix,
error_detail=False,
user_agent_detail=False,
enable_median=False,
erp5_expand_other=False,
):
super().__init__(
......@@ -594,6 +644,7 @@ class ERP5SiteStats(GenericSiteStats):
suffix,
error_detail=error_detail,
user_agent_detail=user_agent_detail,
enable_median=enable_median,
)
self.expand_other = erp5_expand_other
......@@ -602,35 +653,54 @@ class ERP5SiteStats(GenericSiteStats):
# - module id (string)
# - is document (bool)
# - date (string)
self.module = defaultdict(partial(defaultdict, partial(
defaultdict, partial(APDEXStats, threshold, getDuration))))
self.module = defaultdict(
partial(
defaultdict,
partial(
defaultdict,
partial(APDEXStats, threshold, getDuration, enable_median),
),
),
)
# Key levels:
# - id (string)
# => 'other' only if expand_other == False
# - date (string)
self.no_module = defaultdict(partial(
defaultdict, partial(APDEXStats, threshold, getDuration)))
self.no_module = defaultdict(
partial(
defaultdict,
partial(APDEXStats, threshold, getDuration, enable_median),
),
)
self.site_search = defaultdict(partial(APDEXStats, threshold, getDuration))
self.site_search = defaultdict(
partial(APDEXStats, threshold, getDuration, enable_median),
)
def rescale(self, convert, getDuration):
super().rescale(convert, getDuration)
threshold = self.threshold
for document_dict in self.module.values():
for is_document, date_dict in document_dict.items():
new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration))
new_date_dict = defaultdict(
partial(APDEXStats, threshold, getDuration, self.enable_median),
)
for value_date, data in date_dict.items():
new_date_dict[convert(value_date)].accumulateFrom(data)
document_dict[is_document] = new_date_dict
for id_, date_dict in self.no_module.items():
new_date_dict = defaultdict(partial(APDEXStats, threshold, getDuration))
new_date_dict = defaultdict(
partial(APDEXStats, threshold, getDuration, self.enable_median),
)
for value_date, data in date_dict.items():
new_date_dict[convert(value_date)].accumulateFrom(data)
self.no_module[id_] = new_date_dict
attribute = defaultdict(partial(APDEXStats, threshold, getDuration))
attribute = defaultdict(
partial(APDEXStats, threshold, getDuration, self.enable_median),
)
for value_date, data in self.site_search.items():
attribute[convert(value_date)].accumulateFrom(data)
self.site_search = attribute
......@@ -662,13 +732,25 @@ class ERP5SiteStats(GenericSiteStats):
append('<h2>Stats per module</h2><table class="stats stats_erp5"><tr>'
'<th rowspan="2" colspan="3">module</th>'
'<th colspan="4" class="overall_right">overall</th>')
module_document_overall = defaultdict(partial(APDEXStats, self.threshold,
None))
filtered_module = defaultdict(partial(defaultdict, partial(
defaultdict, partial(APDEXStats, self.threshold, None))))
other_overall = APDEXStats(self.threshold, None)
filtered_no_module = defaultdict(partial(
defaultdict, partial(APDEXStats, self.threshold, None)))
module_document_overall = defaultdict(
partial(APDEXStats, self.threshold, None, self.enable_median),
)
filtered_module = defaultdict(
partial(
defaultdict,
partial(
defaultdict,
partial(APDEXStats, self.threshold, None, self.enable_median),
),
),
)
other_overall = APDEXStats(self.threshold, None, self.enable_median)
filtered_no_module = defaultdict(
partial(
defaultdict,
partial(APDEXStats, self.threshold, None, self.enable_median),
),
)
column_set = set()
for key, data_dict in self.no_module.items():
filtered_id_dict = filtered_no_module[key]
......@@ -676,8 +758,9 @@ class ERP5SiteStats(GenericSiteStats):
filtered_id_dict[stat_filter(value_date)].accumulateFrom(value)
other_overall.accumulateFrom(value)
column_set.update(filtered_id_dict)
filtered_site_search = defaultdict(partial(APDEXStats, self.threshold,
None))
filtered_site_search = defaultdict(
partial(APDEXStats, self.threshold, None, self.enable_median),
)
for value_date, value in self.site_search.items():
filtered_site_search[stat_filter(value_date)].accumulateFrom(value)
column_set.update(filtered_site_search)
......@@ -695,10 +778,13 @@ class ERP5SiteStats(GenericSiteStats):
append(f'<th colspan="4">{column}</th>')
append('</tr><tr>')
for i in range(len(column_list) + 1):
append(APDEXStats.asHTMLHeader(i == 0))
append(APDEXStats.asHTMLHeader(
overall=i == 0,
enable_median=self.enable_median,
))
append('</tr>')
def apdexAsColumns(data_dict):
data_total = APDEXStats(self.threshold, None)
data_total = APDEXStats(self.threshold, None, self.enable_median)
for data in data_dict.values():
data_total.accumulateFrom(data)
append(data_total.asHTML(self.threshold, True))
......@@ -754,7 +840,7 @@ class ERP5SiteStats(GenericSiteStats):
append('</tr>')
append('</table><h2>Per-level overall</h2><table class="stats"><tr>'
'<th>level</th>')
append(APDEXStats.asHTMLHeader())
append(APDEXStats.asHTMLHeader(enable_median=self.enable_median))
append('</tr><tr><th>other</th>')
append(other_overall.asHTML(self.threshold))
append('</tr><tr><th>site search</th>')
......@@ -1169,6 +1255,7 @@ def asHTML(
('apdex threshold', f'{args.apdex:.2f}s'),
('period', args.period or (period + ' (auto)')),
('timezone', args.to_timezone or "(input's)"),
('median', ('enabled' if args.enable_median else 'disabled')),
):
out.write(f'<tr><th class="text">{caption}</th><td>{value}</td></tr>')
out.write(f'</table><h2>Hits per {period}</h2><table class="stats">'
......@@ -1253,8 +1340,8 @@ def asJSON(out, encoding, per_site, *_): # pylint: disable=unused-argument
json.dump([(x, y.asJSONState()) for x, y in per_site.items()], out)
format_generator = {
'html': (asHTML, 'utf-8'),
'json': (asJSON, 'ascii'),
'html': (asHTML, 'utf-8', True),
'json': (asJSON, 'ascii', False),
}
ZERO_TIMEDELTA = timedelta(0, 0)
......@@ -1335,6 +1422,9 @@ def main():
'Useful when migrating from one configuration/software package to '
'another while keeping results comparable even if the latter has a '
'much larger (and possibly non-configurable) total request timeout.')
parser.add_argument('--enable-median', action='store_true',
help='Enable median computation. Increases memory use. Forcibly '
'disabled when state files are used, either as input or output.')
group = parser.add_argument_group('generated content (all formats)')
group.add_argument('-a', '--apdex', default=1.0, type=float,
......@@ -1413,6 +1503,8 @@ def main():
else:
parser.error('Neither %D nor %T are present in logformat, apdex '
'cannot be computed.')
generator, out_encoding, enable_median = format_generator[args.format]
args.enable_median = enable_median = enable_median and args.enable_median and not args.state_file
if args.duration_cap:
def getDuration( # pylint: disable=function-redefined
match,
......@@ -1662,6 +1754,7 @@ def main():
error_detail=error_detail,
user_agent_detail=user_agent_detail,
erp5_expand_other=erp5_expand_other,
enable_median=enable_median,
)
try:
site_data.accumulate(match, url_match, hit_date)
......@@ -1674,7 +1767,6 @@ def main():
if show_progress:
print(lineno, file=sys.stderr)
end_parsing_time = time.time()
generator, out_encoding = format_generator[args.format]
if args.out == '-':
out = sys.stdout
out.reconfigure(encoding=out_encoding)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment