Implement automatic period selection.

d14743d9 · Vincent Pelletier · 2c0837f3 · d14743d9 · d14743d9
Commit d14743d9 authored Apr 07, 2013 by Vincent Pelletier
Hide whitespace changes
Inline Side-by-side

Showing with 88 additions and 6 deletions

TODO TODO +0 -1

apachedex/__init__.py apachedex/__init__.py +88 -5

No files found.
--- a/TODO
+++ b/TODO
 - use some templating system instead of hardcoded html strings
 - provide some form of raw data output, not just html
 - allow user to specify min & max dates
- automatically select period from log data ?
--- a/apachedex/__init__.py
+++ b/apachedex/__init__.py
@@ -66,6 +66,7 @@ N_REFERRER_PER_ERROR_URL = 5
 ITEMGETTER0 = itemgetter(0)
 ITEMGETTER1 = itemgetter(1)
 APDEX_TOLERATING_COEF = 4
+AUTO_PERIOD_COEF = 200

 def statusIsError(status):
  return status[0] > '3'
@@ -260,6 +261,18 @@ class GenericSiteStats(object):
    self.url_apdex = defaultdict(partial(APDEXStats, threshold, getDuration))
    self.apdex = defaultdict(partial(APDEXStats, threshold, getDuration))

+  def rescale(self, convert, getDuration):
+    """
+    Re-bucket date-keyed statistics after a period change.
+
+    convert
+      Callable receiving a date key and returning the key of the bucket it
+      must be merged into.
+    getDuration
+      Stored on the instance and handed to every newly created APDEXStats.
+    """
+    self.getDuration = getDuration
+    status_dict = self.status
+    # Only values of existing keys are replaced, so the key set is stable.
+    for status in status_dict.keys():
+      rescaled_count = defaultdict(int)
+      for date, status_count in status_dict[status].iteritems():
+        rescaled_count[convert(date)] += status_count
+      status_dict[status] = rescaled_count
+    # Several old buckets may land on the same new key: merge them.
+    rescaled_apdex = defaultdict(
+      partial(APDEXStats, self.threshold, getDuration))
+    for date, data in self.apdex.iteritems():
+      rescaled_apdex[convert(date)].accumulateFrom(data)
+    self.apdex = rescaled_apdex
+
+
  def accumulate(self, match, url_match, date):
    self.apdex[date].accumulate(match)
    if url_match is None:
@@ -381,6 +394,20 @@ class ERP5SiteStats(GenericSiteStats):
      defaultdict, partial(APDEXStats, threshold, getDuration))))
    self.no_module = defaultdict(partial(APDEXStats, threshold, getDuration))

+  def rescale(self, convert, getDuration):
+    """
+    Re-bucket ERP5-specific date-keyed statistics after a period change.
+
+    Delegates to the parent implementation first, then applies the same
+    conversion to every per-module date dict and to no_module.
+    """
+    super(ERP5SiteStats, self).rescale(convert, getDuration)
+    threshold = self.threshold
+    def rescaled(date_dict):
+      # Merge every old bucket into the bucket designated by convert.
+      result = defaultdict(partial(APDEXStats, threshold, getDuration))
+      for date, data in date_dict.iteritems():
+        result[convert(date)].accumulateFrom(data)
+      return result
+    for document_dict in self.module.itervalues():
+      # Only values of existing keys are replaced, so the key set is stable.
+      for is_document in document_dict.keys():
+        document_dict[is_document] = rescaled(document_dict[is_document])
+    self.no_module = rescaled(self.no_module)
+
+
  def accumulate(self, match, url_match, date):
    split = self.suffix(url_match.group('url')).split('?', 1)[0].split('/')
    if split and split[0].endswith('_module'):
@@ -527,6 +554,9 @@ def _weekStringAsQuarterString(timestamp):
  year, month, _ = timestamp.split('/')
  return '%s/%02i' % (year, int(month) / 3 * 3 + 1)

+def _roundWeek(dt):
+  """
+  Round dt down to the first day of its 7-day bucket within the month.
+
+  Buckets start on days 1, 8, 15, 22 and 29. The result is never greater than
+  dt.day, so it always is a valid day for dt's month. Rounding up (as
+  "dt.day / 7 * 7 + 1" did for days 7, 14, 21 and 28) requests the 29th for
+  the 28th of a 28-day February, which makes datetime.replace raise
+  ValueError.
+  NOTE(review): bucket boundaries should agree with _asWeekString - confirm.
+  """
+  # Explicit floor division, unaffected by "from __future__ import division".
+  return dt.replace(day=(dt.day - 1) // 7 * 7 + 1)
+
 def _asDayString(timestamp):
  dt, _ = timestamp.split(' ')
  day, month, year = dt.split(':', 1)[0].split('/')
@@ -539,6 +569,9 @@ def _as6HourString(timestamp):
  return '%s/%02i/%s %02i' % (year, MONTH_VALUE_DICT[month], day,
    int(hour) / 6 * 6)

+def _round6Hour(dt):
+  """
+  Round dt down to the start of its 6-hour slot (hour 0, 6, 12 or 18).
+
+  Only the hour field is changed; all other fields of dt are kept as-is.
+  """
+  # Modulo keeps the value an int whatever the division semantics in effect
+  # ("/" yields a float under true division, which datetime.replace rejects).
+  return dt.replace(hour=dt.hour - dt.hour % 6)
+
 def _hourAsWeekString(timestamp):
  dt = datetime.strptime(timestamp, '%Y/%m/%d %H')
  return (dt - timedelta(dt.weekday())).date().strftime('%Y/%m/%d')
@@ -559,6 +592,8 @@ def _asHourString(timestamp):
 #   datetime.datetime instance
 # - period during which a placeholder point will be added if there is no data
 #   point
+# - callable rounding a datetime.datetime instance so that, once formatted
+#   with the given format string, it is a valid graph-granularity date for
+#   the period
 period_parser = {
  'year': (
    _asMonthString,
@@ -567,6 +602,7 @@ period_parser = {
    '%Y/%m',
    # Longest month: 31 days
    timedelta(31),
+    lambda x: x,
  ),
  'quarter': (
    _asWeekString,
@@ -576,6 +612,7 @@ period_parser = {
    '7 days',
    '%Y/%m/%d',
    timedelta(7),
+    _roundWeek,
  ),
  'month': (
    _asDayString,
@@ -584,6 +621,7 @@ period_parser = {
    '%Y/%m/%d',
    # Longest day: 24 hours + 1h DST (never more ?)
    timedelta(seconds=3600 * 25),
+    lambda x: x,
  ),
  'week': (
    _as6HourString,
@@ -591,6 +629,7 @@ period_parser = {
    '6 hours',
    '%Y/%m/%d %H',
    timedelta(seconds=3600 * 6),
+    _round6Hour,
  ),
  'day': (
    _asHourString,
@@ -599,6 +638,7 @@ period_parser = {
    '%Y/%m/%d %H',
    # Longest hour: 60 * 60 seconds + 1 leap second.
    timedelta(seconds=3601),
+    lambda x: x,
  ),
 }

@@ -623,8 +663,12 @@ def main():
      'Default: %(default).2fs')
  group.add_argument('-e', '--error-detail', action='store_true',
    help='Include detailed report (url & referers) for error statuses.')
-  group.add_argument('-p', '--period', default='day', choices=period_parser,
-      help='Periodicity of sampling buckets. Default: %(default)r')
+  group.add_argument('-p', '--period', choices=period_parser,
+      help='Periodicity of sampling buckets. Default: (decide from data). '
+      'Performance note: leaving out this parameter reduces parsing '
+      'performance, as each period increase requires re-dispatching already '
+      'processed data. To mitigate this, provide earliest and latest log '
+      'files before all others (ex: log0 log3 log1 log2).')
  group.add_argument('-s', '--stats', action='store_true',
    help='Enable parsing stats (time spent parsing input, time spent '
      'generating output, ...)')
@@ -690,8 +734,23 @@ def main():
    assert not key, key
  matchline = re.compile(line_regex).match
  matchrequest = REQUEST_PATTERN.match
-  asDate, decimator, graph_period, date_format, placeholder_delta = \
-    period_parser[args.period]
+  if args.period is None:
+    next_period_data = ((x, y[4] * AUTO_PERIOD_COEF) for (x, y) in
+      sorted(period_parser.iteritems(), key=lambda x: x[1][4])).next
+    period, to_next_period = next_period_data()
+    earliest_date = latest_date = None
+    def getNextPeriod():
+      # datetime is slow (compared to string operations), but not many choices
+      return (datetime.strptime(earliest_date, date_format) + to_next_period
+        ).strftime(date_format)
+    def rescale(x):
+      result = round_date(datetime.strptime(x, old_date_format)).strftime(date_format)
+      return result
+  else:
+    to_next_period = None
+    period = args.period
+  asDate, decimator, graph_period, date_format, placeholder_delta, \
+    round_date = period_parser[period]
  site_list = args.path
  default_site = args.default
  if default_site is None:
@@ -757,6 +816,30 @@ def main():
        skipped_lines += 1
        continue
      date = asDate(match.group('timestamp'))
+      if to_next_period is not None:
+        if date > latest_date: # '' > None is True
+          latest_date = date
+        if date < earliest_date or earliest_date is None:
+          earliest_date = date
+          next_period = getNextPeriod()
+        if latest_date > next_period:
+          try:
+            while latest_date > next_period:
+              period, to_next_period = next_period_data()
+              next_period = getNextPeriod()
+          except StopIteration:
+            pass
+          print >> sys.stderr, 'Increasing period to', period, '...',
+          old_date_format = date_format
+          asDate, decimator, graph_period, date_format, placeholder_delta, \
+            round_date = period_parser[period]
+          period_increase_start = time.time()
+          print old_date_format, date_format
+          for site_data in per_site.itervalues():
+            site_data.rescale(rescale, getDuration)
+          print >> sys.stderr, 'done (%s)' % timedelta(seconds=time.time()
+            - period_increase_start)
+          date = asDate(match.group('timestamp'))
      try:
        site_data = per_site[site]
      except KeyError:
@@ -793,7 +876,7 @@ def main():
      '<table class="stats">')
    for caption, value in (
          ('apdex threshold', '%.2fs' % args.apdex),
-          ('period', args.period),
+          ('period', args.period or (period + ' (auto)')),
        ):
      out.write('<tr><th class="text">%s</th><td>%s</td></tr>' % (
        caption, value))