Commit b42f842a authored by Vincent Pelletier

Do not unquote URLs.

Unquoted (percent-decoded) URLs have no standard character encoding
(charset), so leave URLs quoted: that way only ASCII ever has to be handled.
This fixes various UnicodeDecodeError failures, at the cost of decreased
report readability.
parent 094a4fd4
......@@ -33,7 +33,7 @@ from collections import defaultdict, Counter
from datetime import datetime, timedelta, date, tzinfo
from functools import partial
from operator import itemgetter
from urllib import splittype, splithost, unquote
from urllib import splittype, splithost
import argparse
import bz2
import calendar
......@@ -107,8 +107,6 @@ if lzma is not None:
# to be urlencoded, so pure ascii. Are timestamps localised ?
INPUT_ENCODING = 'ascii'
unquoteToHtml = lambda x: escape(unquote(x))
MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
......@@ -424,13 +422,12 @@ class GenericSiteStats(object):
reverse=True)[:N_SLOWEST]:
append('<tr>')
append(data.asHTML(self.threshold))
append('<td class="text">%s</td></tr>' % unquoteToHtml(url))
append('<td class="text">%s</td></tr>' % escape(url))
append('</table>')
if self.user_agent_detail:
append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>'
'<th>user agent</th></tr>')
for user_agent, hit in self.user_agent_counter.most_common(N_USER_AGENT):
# XXX: s/escape/unquoteToHtml/ ?
append('<tr><td>%s</td><td class="text">%s</td></tr>' % (hit, escape(user_agent)))
append('</table>')
column_set = set()
......@@ -487,8 +484,8 @@ class GenericSiteStats(object):
append('<td>%s</td><td class="text">%s</td>'
'<td class="text">%s</td>' % (
getHitForUrl(referer_counter),
unquoteToHtml(url),
'<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer))
escape(url),
'<br/>'.join('%i: %s' % (hit, escape(referer))
for referer, hit in referer_counter.most_common(
N_REFERRER_PER_ERROR_URL)),
))
......@@ -1071,7 +1068,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
key=lambda x: site_caption_dict[x[0]])))
html_site_caption_dict = {}
for i, (site_id, _) in site_list:
html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id])
html_site_caption_dict[site_id] = escape(site_caption_dict[site_id])
if len(per_site) > 1:
out.write('<h2>Index</h2><ol>')
for i, (site_id, _) in site_list:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment