Initial version of utility able to check caching

configuration of websites with the help of Varnish. This script uses wget to crawl the website and inspects every status code and the headers returned by the server. It also parses the varnish logs to know whether the client's query goes to the backend or not. This utility is still under development; it is released as a beta version for testing purposes only. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils/erp5.utils.web_checker/@37439 20353a03-c40f-0410-a6d1-a30d3c3de9de

Initial version of utility able to check caching
configuration of websites with the help of Varnish. This script uses wget to crawl the website and inspects every status code and the headers returned by the server. It also parses the varnish logs to know whether the client's query goes to the backend or not. This utility is still under development; it is released as a beta version for testing purposes only. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils/erp5.utils.web_checker/@37439 20353a03-c40f-0410-a6d1-a30d3c3de9de
fe9b836f · Nicolas Delaby · Arnaud Fontaine · 34ae978b · fe9b836f · fe9b836f
Commit fe9b836f authored 14 years ago by Nicolas Delaby Committed by Arnaud Fontaine 13 years ago
Showing with 418 additions and 0 deletions

erp5/util/CHANGES.webchecker.txt erp5/util/CHANGES.webchecker.txt +3 -0

erp5/util/README.webchecker.txt erp5/util/README.webchecker.txt +34 -0

erp5/util/webchecker/__init__.py erp5/util/webchecker/__init__.py +381 -0

No files found.
--- a/erp5/util/CHANGES.webchecker.txt
+++ b/erp5/util/CHANGES.webchecker.txt
+2010-08-03 Nicolas Delaby
+-------------------------
+First release
\ No newline at end of file
--- a/erp5/util/README.webchecker.txt
+++ b/erp5/util/README.webchecker.txt
+Utility able to call wget and varnishlog to extract headers and report all failures
+according to the expected caching policy.
+This utility is configurable through a dictionary like
+   configuration = {'url': 'http://www.example.com',
+                   'working_directory': '/home/me/tmp/crawled_content',
+                   'varnishlog_binary_path': 'varnishlog',
+                   'header_list': {'Last-Modified': True,
+                                   'Cache-Control': ('max-age=300', 'max-age=3600',),
+                                   'Vary' : ('Accept-Language, Cookie', 'Accept-Language,Cookie'),
+                                   'Expires': True,
+                                  },
+                   'email_address': 'me@example.com',
+                   'smtp_host': 'localhost',
+                   'debug_level': 'debug',
+                  }
+url : website to check
+working_directory : directory where fetched data will be downloaded
+varnishlog_binary_path :  path to varnishlog
+header_list : Key == Header id.
+              value: if equal to True, it means that the header needs to be present in the RESPONSE
+                     if it is a tuple, the header value must satisfy at least one of the proposed values
+email_address : email address to send result
+smtp_host : smtp host to use
+debug_level : log level of this utility (debug =>very verbose,
+                                         info=>normal,
+                                         warning=>nothing)
+This utility requires wget >= 1.12
+and a callable varnishlog.
+The utility must be run on the same server where varnish is running.
+web_checker reads the varnish logs to detect whether a query goes to the backend.
--- a/erp5/util/webchecker/__init__.py
+++ b/erp5/util/webchecker/__init__.py
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Nicolas Delaby <nicolas@nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+import os
+import shutil
+import sys
+import re
+import shlex
+from subprocess import Popen, PIPE, STDOUT
+import logging
+import tempfile
+from datetime import datetime
+import threading
+import signal
+# Sentinel used as the default value of the discarded_url_list argument of
+# HTTPCacheCheckerTestSuite._parseWgetLogs; it is compared by identity
+# ("is _MARKER") to detect that the caller did not pass any list.
+_MARKER = []
+class ProcessTimeoutException(Exception):
+  """Exception raised by alarm_handler."""
+def alarm_handler(signum, frame):
+  """Callback with a signal-handler signature: always raise
+  ProcessTimeoutException, whatever signum and frame are.
+  """
+  raise ProcessTimeoutException()
+class HTTPCacheCheckerTestSuite(object):
+  """Crawl a website with wget and check its caching policy through Varnish.
+
+  start() crawls root_url twice with wget while capturing the output of
+  varnishlog, checks the status code and the headers of every response and
+  collects every failure in self.report_list.
+  """
+
+  # Codes returned by _parseWgetLine to classify a line of wget output.
+  # They are also the names of the groups in the matching regexes below.
+  URL_CODE = 'url'
+  STATUS_CODE = 'status'
+  FILE_PATH_CODE = 'path'
+  OTHER_CODE = 'other'
+
+  LOG_LEVEL_DICT = {'debug': logging.DEBUG,
+                    'info': logging.INFO,
+                    'warning': logging.WARNING,
+                    'error': logging.ERROR,
+                    'critical': logging.CRITICAL}
+
+  url_search_in_wget_regex = re.compile(r'^--\d{4}.*--\s+(?P<%s>.*)$' %
+                                                                      URL_CODE)
+  # wget prints "HTTP request sent, awaiting response... 200 OK": the space
+  # between "awaiting" and "response" is required for the regex to match.
+  status_search_in_wget_regex = re.compile(
+    r'^HTTP request sent, awaiting response\.\.\. (?P<%s>\d+).+$' %
+                                                                   STATUS_CODE)
+  file_save_search_regex = re.compile(r"^Saving to: `(?P<%s>.*)'" %
+                                                                FILE_PATH_CODE)
+  x_cache_header_search_regex = re.compile(r'X-Cache:\s(\S+)\s?$',
+                                           re.MULTILINE)
+  x_varnish_header_search_regex = re.compile(r'X-Varnish:\s(\d+)',
+                                             re.MULTILINE)
+  # Template: the header id is substituted before compilation
+  generic_header_search_regex = r'%s:\s(.*)\s$'
+
+  ACCEPTABLE_STATUS_LIST = ('200', '304',)
+
+  def __init__(self, root_url, working_directory, varnishlog_binary_path,
+               header_list, email_address, smtp_host, debug_level):
+    """
+      root_url : website to check
+      working_directory : where fetched data will be downloaded
+      varnishlog_binary_path :  path to varnishlog
+      header_list : Key == Header id.
+                    value: if equal to True, the header
+                             needs to be present in the RESPONSE
+                           if this is a tuple, the header value must satisfy
+                             at least one of the proposed values
+      email_address : email address to send the result to
+                      (if None, start() returns the report instead)
+      smtp_host : smtp host to use
+      debug_level : log level of this utility (debug => very verbose,
+                                               info => normal,
+                                               warning => nothing)
+                    unknown values fall back to info
+    """
+    self.root_url = root_url
+    self.working_directory = working_directory
+    self.varnishlog_binary_path = varnishlog_binary_path
+    self.header_list = header_list
+    self.email_address = email_address
+    self.smtp_host = smtp_host
+    level = self.LOG_LEVEL_DICT.get(debug_level, logging.INFO)
+    logging.basicConfig(filename='erp5_web_checker.log', level=level)
+    # failure messages collected during the run
+    self.report_list = []
+    # seconds to wait for varnishlog before killing it
+    self._timeout = 30
+
+  def _initFolder(self):
+    """Delete the working directory if it exists, then create it empty.
+    """
+    if os.path.isdir(self.working_directory):
+      logging.debug('Re-creating folder:%r' % self.working_directory)
+      shutil.rmtree(self.working_directory)
+    else:
+      logging.debug('Creating folder:%r' % self.working_directory)
+    os.mkdir(self.working_directory)
+
+  def _runVarnishLog(self):
+    """Start varnishlog with the -d option and return the Popen object.
+    Its standard output is read later by _readVarnishLog.
+    """
+    logging.info('Start varnishlog process')
+    command_string = '%s -d' % self.varnishlog_binary_path
+    command_list = shlex.split(command_string)
+    return Popen(command_list, stdin=PIPE, stdout=PIPE)
+
+  def _readVarnishLog(self, process):
+    """Return everything process wrote on its standard output.
+    If the process is still running after self._timeout seconds, it is
+    terminated with SIGTERM and what was read so far is returned.
+    """
+    logging.info('Reading varnishlog with timeout:%r' % self._timeout)
+    def _kill_process(pid):
+      # support for python 2.5 and under.
+      # Shall use process.terminate() for python2.6 and above
+      try:
+        os.kill(pid, signal.SIGTERM)
+      except OSError:
+        # The process ended by itself just before the timer fired:
+        # there is nothing left to kill.
+        pass
+    watcher = threading.Timer(self._timeout, _kill_process,
+                              args=(process.pid,))
+    watcher.start()
+    try:
+      varnishlog_data, _ = process.communicate()
+    finally:
+      # never leave the timer armed, even if communicate() fails
+      watcher.cancel()
+    return varnishlog_data
+
+  def _readVarnishLogAndGetIsBackendTouched(self, varnihslog_data,
+                                                     x_varnish_reference_list):
+    """Utility to parse such string:
+      14 ReqStart     c 127.0.0.1 58470 385643239
+      14 RxRequest    c GET
+      14 RxURL        c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
+      14 RxProtocol   c HTTP/1.1
+      14 RxHeader     c Host: 127.0.0.1:6081
+      14 RxHeader     c User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
+      14 RxHeader     c Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
+      14 RxHeader     c Accept-Language: fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3
+      14 RxHeader     c Accept-Encoding: gzip,deflate
+      14 RxHeader     c Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
+      14 RxHeader     c Referer: http://www.example.com/
+      14 RxHeader     c If-Modified-Since: Thu, 25 Feb 2010 16:22:23 GMT
+      14 RxHeader     c Via: 1.1 www.example.com
+      14 RxHeader     c X-Forwarded-For: 82.226.226.226
+      14 RxHeader     c X-Forwarded-Host: www.example.com
+      14 RxHeader     c X-Forwarded-Server: www.example.com
+      14 RxHeader     c Connection: Keep-Alive
+      14 VCL_call     c recv
+      14 VCL_return   c lookup
+      14 VCL_call     c hash
+      14 VCL_return   c hash
+      14 VCL_call     c miss
+      14 VCL_return   c fetch
+      14 Backend      c 15 default default
+      15 TxRequest    b GET
+      15 TxURL        b /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
+      15 TxProtocol   b HTTP/1.1
+      15 TxHeader     b Host: 127.0.0.1:6081
+      15 TxHeader     b User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
+      15 TxHeader     b Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
+      15 TxHeader     b Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
+      15 TxHeader     b Referer: http://www.example.com/
+      15 TxHeader     b Via: 1.1 www.example.com
+      15 TxHeader     b X-Forwarded-For: 82.226.226.226
+      15 TxHeader     b X-Forwarded-Host: www.example.com
+      15 TxHeader     b X-Forwarded-Server: www.example.com
+      15 TxHeader     b X-Varnish: 385643239
+      15 TxHeader     b X-Forwarded-For: 127.0.0.1
+      16 SessionOpen  c 127.0.0.1 58471 :6081
+      16 ReqStart     c 127.0.0.1 58471 385643240
+      16 RxRequest    c GET
+      16 RxURL        c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/erp5_web_example_layout.css
+      16 RxProtocol   c HTTP/1.1
+      16 RxHeader     c Host: 127.0.0.1:6081
+      [...]
+    This method is able to know if the request goes to the backend or not
+    according to the ID of the request, ie: 385643239
+    If we are able to read such string "TxHeader     b X-Varnish: 385643239"
+    it means varnish sent a query to the backend.
+
+    varnihslog_data : output of varnishlog, as a string
+    x_varnish_reference_list : list of tuples
+                               (request id, backend expected to be touched, url)
+                               as returned by _parseWgetLogs
+    Every request whose observed behaviour differs from the expected one is
+    appended to self.report_list.
+    """
+    for x_varnish_reference in x_varnish_reference_list:
+      # (?!\d) prevents id 38564323 from matching inside 385643239
+      backend_is_touched_regex = re.compile(
+        r'TxHeader\s+b\sX-Varnish:\s%s(?!\d)' %
+                                       re.escape(str(x_varnish_reference[0])))
+      match_object = backend_is_touched_regex.search(varnihslog_data)
+      backend_touched = match_object is not None
+      logging.debug('%r %r' % (backend_touched, x_varnish_reference,))
+      if backend_touched != x_varnish_reference[1]:
+        if backend_touched:
+          title = 'Error:Backend touched'
+        else:
+          title = 'Error Backend not touched'
+        message = '%s -> X-Varnish:%r' % (title, x_varnish_reference,)
+        self.report_list.append(message)
+
+  def _parseWgetLine(self, line):
+    """return tuple (code, value)
+    code: 'url' means this is the requested url
+          'status' means value is the status code
+          'path' means value is the path where the response is saved
+          'other' something we can discard (value is the line itself)
+    """
+    regex_and_code_list = (
+      (self.url_search_in_wget_regex, self.URL_CODE),
+      (self.status_search_in_wget_regex, self.STATUS_CODE),
+      (self.file_save_search_regex, self.FILE_PATH_CODE),
+    )
+    for regex, code in regex_and_code_list:
+      match_object = regex.search(line)
+      if match_object is not None:
+        # each regex names its group after its code
+        return code, match_object.group(code)
+    return self.OTHER_CODE, line
+
+  def _runSpider(self):
+    """Run wget recursively on root_url from working_directory, saving the
+    response headers in the fetched files.
+    Return the output of wget (stdout and stderr merged).
+    """
+    wget_command_string = 'wget -r -nc --retry-connrefused --save-headers %s '\
+                                                                % self.root_url
+    logging.debug('wget command:%r' % wget_command_string)
+    wget_argument_list = shlex.split(wget_command_string)
+    wget_process = Popen(wget_argument_list, stdin=PIPE,
+                         stdout=PIPE, stderr=STDOUT,
+                         env={'LANG': 'en_EN'}, # Force output messages
+                                                # in English
+                         universal_newlines=True,
+                         cwd=self.working_directory) # working directory
+    # stderr is redirected to stdout, so the second item is always None
+    stdout, _ = wget_process.communicate()
+    return stdout
+
+  def _readFetchedFile(self, url, file_path):
+    """Return the content wget saved for url, or None if no file is found.
+    The response may be stored under file_path itself, under index.html
+    inside file_path when it is a folder, or under file_path suffixed
+    with '.1'.
+    """
+    candidate_path_list = (file_path,
+                           file_path + os.path.sep + 'index.html',
+                           file_path + '.1')
+    for candidate_path in candidate_path_list:
+      try:
+        file_object = open(candidate_path, 'r')
+      except IOError:
+        continue
+      try:
+        return file_object.read()
+      finally:
+        file_object.close()
+    logging.info('File not found for url:%r %r' % (url, candidate_path_list,))
+    return None
+
+  def _checkHeaders(self, url, fetched_data):
+    """Check every header configured in self.header_list against
+    fetched_data and report missing headers or unexpected values.
+    """
+    for header, reference_value in self.header_list.items():
+      re_compiled = re.compile(self.generic_header_search_regex % header,
+                                                                  re.MULTILINE)
+      match_object = re_compiled.search(fetched_data)
+      if match_object is None:
+        message = 'header:%r not found for %r' % (header, url)
+        self.report_list.append(message)
+        continue
+      read_value = match_object.group(1)
+      if reference_value is True and not read_value:
+        message = 'value of header:%r not found for %r' % (header, url)
+        self.report_list.append(message)
+      elif isinstance(reference_value, (tuple, list)):
+        if read_value not in reference_value:
+          message = 'value of header:%r does not match'\
+                    ' for %r (%r not in %r)' %\
+                               (header, url, read_value, reference_value)
+          self.report_list.append(message)
+
+  def _parseWgetLogs(self, wget_log_file, discarded_url_list=_MARKER):
+    """read wget logs and test Caching configuration
+
+    wget_log_file : output of wget, as a string
+    discarded_url_list : urls which must not be checked. When it is not
+                         passed, this is the first pass and all urls are
+                         checked. A passed list is extended in place with
+                         the urls served from the cache.
+    Return a tuple (x_varnish_reference_list, discarded_url_list) where
+    x_varnish_reference_list holds tuples
+    (request id, backend expected to be touched, url) and
+    discarded_url_list is a copy of the list of urls served from the cache.
+    Failures are appended to self.report_list.
+    """
+    first_pass = discarded_url_list is _MARKER
+    if first_pass:
+      # Work on a fresh list: appending to the module-level sentinel would
+      # leak the urls of this run into all the following ones.
+      discarded_url_list = []
+    x_varnish_reference_list = []
+    # wget may print lines before the first url line
+    url = None
+    discarded = False
+    for line in wget_log_file.splitlines():
+      logging.debug('wget output:%r' % line)
+      code, value = self._parseWgetLine(line)
+      if code == self.URL_CODE:
+        # This is the first Line by URL checked in wget stdout
+        url = value
+        logging.debug('url:%r' % url)
+        discarded = not first_pass and url in discarded_url_list
+        if discarded:
+          # URL already checked during first pass
+          logging.debug('%r Discarded' % url)
+      if discarded:
+        # keep reading wget output without doing anything
+        continue
+      if code == self.STATUS_CODE:
+        if value not in self.ACCEPTABLE_STATUS_LIST:
+          message = 'Page in error:%r status:%r' % (url, value)
+          if message not in self.report_list:
+            self.report_list.append(message)
+      elif code == self.FILE_PATH_CODE:
+        # Here we check if Response was cached
+        file_path = os.path.sep.join((self.working_directory, value))
+        fetched_data = self._readFetchedFile(url, file_path)
+        if fetched_data is None:
+          continue
+        x_cache_header_match_object =\
+                          self.x_cache_header_search_regex.search(fetched_data)
+        x_varnish_match_object =\
+                        self.x_varnish_header_search_regex.search(fetched_data)
+        if x_cache_header_match_object is None:
+          # This RESPONSE is not cached by Varnish
+          message = 'X-Cache header not found for %r' % url
+          self.report_list.append(message)
+        elif x_varnish_match_object is None:
+          # Without the request id, the varnish logs can not be checked
+          message = 'X-Varnish header not found for %r' % url
+          self.report_list.append(message)
+        else:
+          # X-Cache header is present in response
+          # Read the X-Varnish header to know if backend has been touched
+          x_varnish_reference = x_varnish_match_object.group(1)
+          logging.info('x_varnish_reference:%r for url:%r' %\
+                                                    (x_varnish_reference, url))
+          hits = x_cache_header_match_object.group(1)
+          if hits.isdigit():
+            # This is a cached content with a positive hit value
+            # Check that the request did not go to the backend
+            x_varnish_reference_list.append((x_varnish_reference, False, url))
+            # do not check this url in second pass
+            logging.debug('will be discarded:%r' % url)
+            discarded_url_list.append(url)
+          else:
+            x_varnish_reference_list.append((x_varnish_reference, True, url))
+        self._checkHeaders(url, fetched_data)
+    return x_varnish_reference_list, discarded_url_list[:]
+
+  def start(self):
+    """Run test suite: crawl the website twice and check every response.
+
+    If self.email_address is not None, the report is sent by email and a
+    confirmation string is returned; otherwise the report itself ('No errors'
+    or the list of errors) is returned.
+    """
+    logging.info('#'*52)
+    logging.info('date:%r' % (datetime.now().isoformat()))
+    logging.info('#'*52)
+    self._initFolder()
+    logging.info('First pass:%r' % self.root_url)
+    varnishlog_reading_process = self._runVarnishLog()
+    wget_log_file = self._runSpider()
+    varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
+    x_varnish_reference_list, discarded_url_list =\
+                                             self._parseWgetLogs(wget_log_file)
+    self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
+                                                      x_varnish_reference_list)
+    logging.info('End of First pass\n')
+    for discarded_url in discarded_url_list:
+      logging.debug(discarded_url)
+    self._initFolder()
+    logging.info('Second pass:%r' % self.root_url)
+    varnishlog_reading_process = self._runVarnishLog()
+    wget_log_file = self._runSpider()
+    varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
+    x_varnish_reference_list, discarded_url_list =\
+                                             self._parseWgetLogs(wget_log_file,
+                                         discarded_url_list=discarded_url_list)
+    self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
+                                                      x_varnish_reference_list)
+    logging.info('End of second pass\n')
+    # "status" and not "signal", which is the name of an imported module
+    if self.report_list:
+      report_message = 'Errors:\n\t%s' % '\n\t'.join(self.report_list)
+      status = 'PROBLEM'
+    else:
+      report_message = 'No errors'
+      status = 'OK'
+    if self.email_address is None:
+      return report_message
+    import smtplib
+    subject = 'HTTP Cache checker results for %s: %s' % (self.root_url,
+                                                                        status)
+    message = 'Subject: %s\nFrom: %s\nTo: %s\n\n%s' %\
+              (subject, self.email_address, self.email_address, report_message)
+    server = smtplib.SMTP(self.smtp_host)
+    try:
+      server.sendmail(self.email_address, self.email_address, message)
+    finally:
+      # close the connection even if the message is refused
+      server.quit()
+    return 'Email sends to %s' % self.email_address
\ No newline at end of file