Commit b42f842a authored by Vincent Pelletier

Do not unquote URLs.

There is no standard encoding (charset) for unquoted URLs, so do not unquote
them: this way only ASCII has to be handled. Fixes various UnicodeDecodeError
failures, at the cost of decreased report readability.
parent 094a4fd4
...@@ -33,7 +33,7 @@ from collections import defaultdict, Counter ...@@ -33,7 +33,7 @@ from collections import defaultdict, Counter
from datetime import datetime, timedelta, date, tzinfo from datetime import datetime, timedelta, date, tzinfo
from functools import partial from functools import partial
from operator import itemgetter from operator import itemgetter
from urllib import splittype, splithost, unquote from urllib import splittype, splithost
import argparse import argparse
import bz2 import bz2
import calendar import calendar
...@@ -107,8 +107,6 @@ if lzma is not None: ...@@ -107,8 +107,6 @@ if lzma is not None:
# to be urlencoded, so pure ascii. Are timestamps localised ? # to be urlencoded, so pure ascii. Are timestamps localised ?
INPUT_ENCODING = 'ascii' INPUT_ENCODING = 'ascii'
unquoteToHtml = lambda x: escape(unquote(x))
MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar', MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1)) 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
...@@ -424,13 +422,12 @@ class GenericSiteStats(object): ...@@ -424,13 +422,12 @@ class GenericSiteStats(object):
reverse=True)[:N_SLOWEST]: reverse=True)[:N_SLOWEST]:
append('<tr>') append('<tr>')
append(data.asHTML(self.threshold)) append(data.asHTML(self.threshold))
append('<td class="text">%s</td></tr>' % unquoteToHtml(url)) append('<td class="text">%s</td></tr>' % escape(url))
append('</table>') append('</table>')
if self.user_agent_detail: if self.user_agent_detail:
append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>' append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>'
'<th>user agent</th></tr>') '<th>user agent</th></tr>')
for user_agent, hit in self.user_agent_counter.most_common(N_USER_AGENT): for user_agent, hit in self.user_agent_counter.most_common(N_USER_AGENT):
# XXX: s/escape/unquoteToHtml/ ?
append('<tr><td>%s</td><td class="text">%s</td></tr>' % (hit, escape(user_agent))) append('<tr><td>%s</td><td class="text">%s</td></tr>' % (hit, escape(user_agent)))
append('</table>') append('</table>')
column_set = set() column_set = set()
...@@ -487,8 +484,8 @@ class GenericSiteStats(object): ...@@ -487,8 +484,8 @@ class GenericSiteStats(object):
append('<td>%s</td><td class="text">%s</td>' append('<td>%s</td><td class="text">%s</td>'
'<td class="text">%s</td>' % ( '<td class="text">%s</td>' % (
getHitForUrl(referer_counter), getHitForUrl(referer_counter),
unquoteToHtml(url), escape(url),
'<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer)) '<br/>'.join('%i: %s' % (hit, escape(referer))
for referer, hit in referer_counter.most_common( for referer, hit in referer_counter.most_common(
N_REFERRER_PER_ERROR_URL)), N_REFERRER_PER_ERROR_URL)),
)) ))
...@@ -1071,7 +1068,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict, ...@@ -1071,7 +1068,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
key=lambda x: site_caption_dict[x[0]]))) key=lambda x: site_caption_dict[x[0]])))
html_site_caption_dict = {} html_site_caption_dict = {}
for i, (site_id, _) in site_list: for i, (site_id, _) in site_list:
html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id]) html_site_caption_dict[site_id] = escape(site_caption_dict[site_id])
if len(per_site) > 1: if len(per_site) > 1:
out.write('<h2>Index</h2><ol>') out.write('<h2>Index</h2><ol>')
for i, (site_id, _) in site_list: for i, (site_id, _) in site_list:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment