Use regexes for URL bases instead of startswith().

Hurts parsing performance by a few percent, sadly, but I don't see any other way.

Use regexes for URL bases instead of startswith().
Hurts parsing performance by a few percent, sadly, but I don't see any other way.
3ebcfa46 · Vincent Pelletier · 75507403 · 3ebcfa46
Commit 3ebcfa46 authored Apr 06, 2013 by Vincent Pelletier
Show whitespace changes
Inline Side-by-side

Showing with 23 additions and 26 deletions

apachedex/__init__.py apachedex/__init__.py +23 -26

No files found.
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -249,9 +249,9 @@ class APDEXStats(object):
    return float(self.duration_max) / US_PER_S

 class GenericSiteStats(object):
-  def __init__(self, threshold, getDuration, prefix=1, error_detail=False):
+  def __init__(self, threshold, getDuration, suffix, error_detail=False):
    self.threshold = threshold
-    self.prefix = prefix
+    self.suffix = suffix
    self.error_detail = error_detail
    self.getDuration = getDuration
    self.status = defaultdict(partial(defaultdict, int))
@@ -371,8 +371,8 @@ class ERP5SiteStats(GenericSiteStats):
  - If a line belongs to a module and has at least 2 slashes after module,
    count line as belonging to a document of that module
  """
-  def __init__(self, threshold, getDuration, prefix=1, error_detail=False):
-    super(ERP5SiteStats, self).__init__(threshold, getDuration, prefix=prefix,
+  def __init__(self, threshold, getDuration, suffix, error_detail=False):
+    super(ERP5SiteStats, self).__init__(threshold, getDuration, suffix,
      error_detail=error_detail)
    # Key levels:
    # - module id (string)
@@ -383,12 +383,10 @@ class ERP5SiteStats(GenericSiteStats):
    self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration))

  def accumulate(self, match, url_match, date):
-    prefix = self.prefix
-    split = url_match.group('url').split('?', 1)[0].split('/')[1 + prefix:]
-    if split:
-      module = split[0]
-      if module.endswith('_module'):
+    split = self.suffix(url_match.group('url')).split('?', 1)[0].split('/')
+    if split and split[0].endswith('_module'):
      super(ERP5SiteStats, self).accumulate(match, url_match, date)
+      module = split[0]
      self.module[module][
        len(split) > 1 and (split[1] != 'view' and '_view' not in split[1])
      ][date].accumulate(match)
@@ -496,13 +494,12 @@ class AggregateSiteUrl(argparse.Action):
    action = self.__argument_to_aggregator[option_string]
    dest = getattr(namespace, self.dest)
    for value in values:
+      match = re.compile(value).match
      if action is not None:
-        if value[-1:] == '/':
-          offset = -1
-        else:
-          offset = 0
-        action = partial(action, prefix=value.count('/') + offset)
-      dest.append((value, action))
+        match_suffix = re.compile(value + '(?P<suffix>.*)').match
+        action = partial(action,
+          suffix=lambda x: match_suffix(x).group('suffix'))
+      dest.append((value, match, action))

 def _asMonthString(timestamp):
  dt, tz = timestamp.split(' ')
@@ -592,8 +589,9 @@ def main():
      help='Embed js files instead of linking to them.')

  group = parser.add_argument_group('site matching', 'Earlier arguments take '
-    'precedence. For example: --skip-base /foo/bar --base /foo generates '
-    'stats for /foo, excluding /foo/bar.')
+    'precedence. For example: --skip-base "/foo/bar(/|$|\\?)" '
+    '--base "/foo(/|$|\\?)" generates stats for /foo, excluding /foo/bar. '
+    'Arguments (except for -d/--default) are interpreted as Python regexes.')
  group.add_argument('-d', '--default',
    help='Caption for lines matching no prefix, or skip them if not provided.')
  group.add_argument('--base', dest='path', default=[], nargs='+',
@@ -655,7 +653,7 @@ def main():
        'specified, nothing to do.'
      sys.exit(1)
  else:
-    default_action = partial(GenericSiteStats, prefix=0)
+    default_action = partial(GenericSiteStats, suffix=lambda x: x)
  infile_list = args.logfile
  quiet = args.quiet
  threshold = args.apdex
@@ -702,9 +700,8 @@ def main():
      url = url_match.group('url')
      if url.startswith('http'):
        url = splithost(splittype(url)[1])[1]
-      startswith = url.startswith
-      for site, action in site_list:
-        if startswith(site):
+      for site, prefix_match, action in site_list:
+        if prefix_match(url) is not None:
          break
      else:
        site = default_site