Toward Python3: unicode literals

Costs 30% performance on pypy.

Toward Python3: unicode literals
Costs 30% performance on pypy.
4a9737a9 · Vincent Pelletier · b068f82d · 4a9737a9
Commit 4a9737a9 authored Apr 14, 2013 by Vincent Pelletier
Show whitespace changes
Inline Side-by-side

Showing with 57 additions and 30 deletions

apachedex/__init__.py apachedex/__init__.py +57 -30

No files found.
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -26,7 +26,8 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 #
 ##############################################################################
-from __future__ import print_function, division, absolute_import
+from __future__ import print_function, division, absolute_import, \
+  unicode_literals
 from cgi import escape
 from collections import defaultdict, Counter
 from datetime import datetime, timedelta, date
@@ -37,6 +38,7 @@ import argparse
 import bz2
 import calendar
 import codecs
+import functools
 import gzip
 import httplib
 import itertools
@@ -54,18 +56,58 @@ import traceback
 def getResource(name, encoding='utf-8'):
  return pkgutil.get_data(__name__, name).decode(encoding)

+def _wrapOpen(func):
+  """
+  Wrap a binary file opener so it accepts "encoding" and "errors" keywords.
+
+  func
+    Callable returning a file object. It receives every positional and
+    keyword argument given to the wrapper, except "encoding" and "errors".
+
+  When "encoding" is None or absent, the file object returned by func is
+  handed back untouched. Otherwise it is wrapped in a
+  codecs.StreamReaderWriter for that encoding, with "errors" (default:
+  'strict') as error handling scheme.
+  Raises LookupError on unknown encoding, before func gets called.
+  """
+  @functools.wraps(func)
+  def wrapper(*args, **kw):
+    encoding = kw.pop('encoding', None)
+    errors = kw.pop('errors', 'strict')
+    if encoding is None:
+      # No codec requested. codecs.lookup(None) raises TypeError, so it must
+      # not be reached on this path.
+      return func(*args, **kw)
+    # Resolve the codec before opening, so an unknown encoding does not leave
+    # an opened file behind.
+    info = codecs.lookup(encoding)
+    srw = codecs.StreamReaderWriter(
+      func(*args, **kw),
+      info.streamreader,
+      info.streamwriter,
+      errors,
+    )
+    # Expose the encoding on the wrapper, as codecs.open does.
+    srw.encoding = encoding
+    return srw
+  return wrapper
+
+# Select, per python version, the openers and read mode used for log files.
+# Callers pass an "encoding" keyword to these openers and to "open".
+# lzma stays None unless its import below happens, so later code can test
+# whether the module is available.
+lzma = None
+gzip_open = gzip.open
+if sys.version_info >= (3, 3):
+  import lzma
+  bz2_open = bz2.open
+  _read_mode = 'rt'
+else:
+  # Shadow the "open" builtin in this module with codecs.open, which takes an
+  # "encoding" argument.
+  open = codecs.open
+  # _wrapOpen adds "encoding" and "errors" keyword handling to these openers.
+  gzip_open = _wrapOpen(gzip_open)
+  bz2_open = _wrapOpen(bz2.BZ2File)
+  _read_mode = 'r'
+
 FILE_OPENER_LIST = [
-  (gzip.open, IOError),
-  (bz2.BZ2File, IOError),
+  (gzip_open, IOError),
+  (bz2_open, IOError),
 ]
-
-try:
+if lzma is None:
+  try:
    from backports import lzma
-except ImportError:
+  except ImportError:
    pass
-else:
+if lzma is not None:
  FILE_OPENER_LIST.append((lzma.open, lzma.LZMAError))

+# XXX: what encoding ? apache doesn't document one, but requests are supposed
+# to be urlencoded, so pure ascii. Are timestamps localised ?
+INPUT_ENCODING = 'ascii'
+
+# unquoteToHtml(x): url-unquote text x, then escape the result for inclusion
+# in HTML.
+if sys.version_info < (3, ):
+  # unquote is given an ascii-encoded byte string and its result is decoded
+  # as utf-8. Percent-escapes can carry arbitrary bytes, so decode with
+  # 'replace' rather than let a single malformed url raise
+  # UnicodeDecodeError.
+  unquoteToHtml = lambda x: escape(
+    unquote(x.encode('ascii')).decode('utf-8', 'replace'))
+else:
+  unquoteToHtml = lambda x: escape(unquote(x))
+
 MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
  'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))

@@ -351,7 +393,7 @@ class GenericSiteStats(object):
        reverse=True)[:N_SLOWEST]:
      append('<tr>')
      append(data.asHTML(self.threshold))
-      append('<td class="text">%s</td></tr>' % unquoteToHtml(url, encoding))
+      append('<td class="text">%s</td></tr>' % unquoteToHtml(url))
    append('</table>')
    append('<h2>User agents</h2><table class="stats"><tr><th>hits</th>'
      '<th>user agent</th></tr>')
@@ -413,8 +455,8 @@ class GenericSiteStats(object):
          append('<td>%s</td><td class="text">%s</td>'
            '<td class="text">%s</td>' % (
            getHitForUrl(referer_counter),
-            unquoteToHtml(url, encoding),
-            '<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer, encoding))
+            unquoteToHtml(url),
+            '<br/>'.join('%i: %s' % (hit, unquoteToHtml(referer))
              for referer, hit in referer_counter.most_common(
                N_REFERRER_PER_ERROR_URL)),
          ))
@@ -931,9 +973,6 @@ period_parser = {
  ),
 }

-unquoteToHtml = lambda x, encoding: escape(unquote(x).decode(encoding,
-  'replace'))
-
 apdex_y_scale_dict = {
  'linear': None,
  'log': 'log100To0',
@@ -980,8 +1019,7 @@ def asHTML(out, encoding, per_site, args, default_site, period_parameter_dict,
    key=lambda x: site_caption_dict[x[0]])))
  html_site_caption_dict = {}
  for i, (site_id, _) in site_list:
-    html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id],
-      encoding)
+    html_site_caption_dict[site_id] = unquoteToHtml(site_caption_dict[site_id])
  if len(per_site) > 1:
    out.write('<h2>Index</h2><ol>')
    for i, (site_id, _) in site_list:
@@ -1084,17 +1122,6 @@ format_generator = {
  'json': (asJSON, 'ascii'),
 }

-# XXX: monkey-patching json module to emit strings instead of unicode objects.
-# Because strings are faster, (30% overall performance hit moving to unicode
-# objects), and only ASCII is expected (urlencoded is ASCII).
-# Subclassing JSONDecoder is not enough as object parser uses scanstring
-# directly.
-original_scanstring = json.decoder.scanstring
-def _scanstring(*args, **kw):
-  string, end = original_scanstring(*args, **kw)
-  return string.encode('ascii'), end
-json.decoder.scanstring = _scanstring
-
 def main():
  parser = ShlexArgumentParser(description='Compute Apdex out of '
    'apache-style log files', fromfile_prefix_chars='@')
@@ -1246,7 +1273,7 @@ def main():
    if state_file_name == '-':
      state_file = sys.stdin
    else:
-      state_file = open(state_file_name)
+      state_file = open(state_file_name, encoding='ascii')
    with state_file:
      load_start = time.time()
      state = json.load(state_file)
@@ -1289,7 +1316,7 @@ def main():
      logfile = sys.stdin
    else:
      for opener, exc in FILE_OPENER_LIST:
-        logfile = opener(filename)
+        logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING)
        try:
          logfile.readline()
        except exc:
@@ -1298,7 +1325,7 @@ def main():
          logfile.seek(0)
          break
      else:
-        logfile = open(filename)
+        logfile = open(filename, _read_mode, encoding=INPUT_ENCODING)
    lineno = 0
    for lineno, line in enumerate(logfile, 1):
      if show_progress and lineno % 5000 == 0: