Commit fe9b836f authored by Nicolas Delaby's avatar Nicolas Delaby Committed by Arnaud Fontaine

Initial version of utility able to check caching

configuration of websites with help of Varnish.

This script use wget to crawl the website and inspect all Status code,
then headers returns by server.

This script parse also varnish logs to know if client's query goes to the backend
or not.

This utility is still under developement, it's released under beta version
for testing purpose only.


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils/erp5.utils.web_checker/@37439 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 34ae978b
2010-08-03 Nicolas Delaby
-------------------------
First release
\ No newline at end of file
Utility able to call wget and varnishlog to extract Headers and return all failures
according expected caching policy.
This utility is configurable through a dictionnary like
configuration = {'url': 'http://www.example.com',
'working_directory': '/home/me/tmp/crawled_content',
'varnishlog_binary_path': 'varnishlog',
'header_list': {'Last-Modified': True,
'Cache-Control': ('max-age=300', 'max-age=3600',),
'Vary' : ('Accept-Language, Cookie', 'Accept-Language,Cookie'),
'Expires': True,
},
'email_address': 'me@example.com',
'smtp_host': 'localhost',
'debug_level': 'debug',
}
url : website to check
working_directory : fetched data will be downloaded
varnishlog_binary_path : path to varnishlog
header_list : Key == Header id.
value: if equals to True, it means that header needs to be present in RESPONSE
if it is a tuple, the Header value must sastify at least one of the proposed values
email_address : email address to send result
smtp_host : smtp host to use
debug_level : log level of this utility (debug =>very verbose,
info=>normal,
warning=>nothing)
This utility requires wget => 1.12
And a callable varnishlog.
The utility must be run on same server where varnish is running.
web_checker reads varnishlogs to detect if a Query goes to the backend.
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import os
import shutil
import sys
import re
import shlex
from subprocess import Popen, PIPE, STDOUT
import logging
import tempfile
from datetime import datetime
import threading
import signal
_MARKER = []
class ProcessTimeoutException(Exception):
pass
def alarm_handler(signum, frame):
raise ProcessTimeoutException
class HTTPCacheCheckerTestSuite(object):
URL_CODE = 'url'
STATUS_CODE = 'status'
FILE_PATH_CODE = 'path'
OTHER_CODE = 'other'
LOG_LEVEL_DICT = {'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR,
'critical': logging.CRITICAL}
url_search_in_wget_regex = re.compile('^--\d{4}.*--\s+(?P<%s>.*)$' %\
URL_CODE)
status_search_in_wget_regex = re.compile('^HTTP request sent, awaiting'\
'response\.\.\. (?P<%s>\d+).+$' % STATUS_CODE)
file_save_search_regex = re.compile("^Saving to: `(?P<%s>.*)'" %\
FILE_PATH_CODE)
x_cache_header_search_regex = re.compile('X-Cache:\s(\S+)\s?$', re.MULTILINE)
x_varnish_header_search_regex = re.compile('X-Varnish:\s(\d+)', re.MULTILINE)
generic_header_search_regex = '%s:\s(.*)\s$'
ACCEPTABLE_STATUS_LIST = ('200', '304',)
def __init__(self, root_url, working_directory, varnishlog_binary_path,
header_list, email_address, smtp_host, debug_level):
"""
root_url : website to check
working_directory : where fetched data will be downloaded
varnishlog_binary_path : path to varnishlog
header_list : Key == Header id.
value: if equals True means header
needs to be present in RESPONSE
if this is tuple, the Header value must sastify
at least one of the proposed values
email_address : email address to send result
smtp_host : smtp host to use
debug_level : log level of this utility (debug =>very verbose,
info=>normal,
warning=> nothing)
"""
self.root_url = root_url
self.working_directory = working_directory
self.varnishlog_binary_path = varnishlog_binary_path
self.header_list = header_list
self.email_address = email_address
self.smtp_host = smtp_host
level = self.LOG_LEVEL_DICT.get(debug_level, logging.INFO)
logging.basicConfig(filename='erp5_web_checker.log', level=level)
self.report_list = []
self._timeout = 30
def _initFolder(self):
"""Delete and create workgin directory
"""
if os.path.isdir(self.working_directory):
logging.debug('Re-creating folder:%r' % self.working_directory)
shutil.rmtree(self.working_directory)
os.mkdir(self.working_directory)
if not os.path.isdir(self.working_directory):
logging.debug('Creating folder:%r' % self.working_directory)
os.mkdir(self.working_directory)
def _runVarnishLog(self):
"""Run varnishlog and listen comunications
"""
logging.info('Start varnishlog process')
command_string = '%s -d' % (self.varnishlog_binary_path)#, filename)
command_list = shlex.split(command_string)
process = Popen(command_list, stdin=PIPE, stdout=PIPE)
return process
def _readVarnishLog(self, process):
"""Add timeout support
"""
logging.info('Reading varnishlog with timeout:%r' % self._timeout)
def _kill_process(pid):
# support for python 2.5 and under.
# Shall use process.terminate() for python2.6 and above
os.kill(pid, signal.SIGTERM)
watcher = threading.Timer(self._timeout, _kill_process, args=(process.pid,))
watcher.start()
varnishlog_data, _ = process.communicate()
watcher.cancel()
return varnishlog_data
def _readVarnishLogAndGetIsBackendTouched(self, varnihslog_data,
x_varnish_reference_list):
"""Utility to parse such string:
14 ReqStart c 127.0.0.1 58470 385643239
14 RxRequest c GET
14 RxURL c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
14 RxProtocol c HTTP/1.1
14 RxHeader c Host: 127.0.0.1:6081
14 RxHeader c User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
14 RxHeader c Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
14 RxHeader c Accept-Language: fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3
14 RxHeader c Accept-Encoding: gzip,deflate
14 RxHeader c Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
14 RxHeader c Referer: http://www.example.com/
14 RxHeader c If-Modified-Since: Thu, 25 Feb 2010 16:22:23 GMT
14 RxHeader c Via: 1.1 www.example.com
14 RxHeader c X-Forwarded-For: 82.226.226.226
14 RxHeader c X-Forwarded-Host: www.example.com
14 RxHeader c X-Forwarded-Server: www.example.com
14 RxHeader c Connection: Keep-Alive
14 VCL_call c recv
14 VCL_return c lookup
14 VCL_call c hash
14 VCL_return c hash
14 VCL_call c miss
14 VCL_return c fetch
14 Backend c 15 default default
15 TxRequest b GET
15 TxURL b /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
15 TxProtocol b HTTP/1.1
15 TxHeader b Host: 127.0.0.1:6081
15 TxHeader b User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
15 TxHeader b Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
15 TxHeader b Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
15 TxHeader b Referer: http://www.example.com/
15 TxHeader b Via: 1.1 www.example.com
15 TxHeader b X-Forwarded-For: 82.226.226.226
15 TxHeader b X-Forwarded-Host: www.example.com
15 TxHeader b X-Forwarded-Server: www.example.com
15 TxHeader b X-Varnish: 385643239
15 TxHeader b X-Forwarded-For: 127.0.0.1
16 SessionOpen c 127.0.0.1 58471 :6081
16 ReqStart c 127.0.0.1 58471 385643240
16 RxRequest c GET
16 RxURL c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/erp5_web_example_layout.css
16 RxProtocol c HTTP/1.1
16 RxHeader c Host: 127.0.0.1:6081
[...]
This script is able to know if the request goes to the backend or not according ID of request
ie: 385643239
if we are able to read such string "TxHeader b X-Varnish: 385643239"
means varnish send a query to the backend.
"""
for x_varnish_reference in x_varnish_reference_list:
backend_is_touched_regex = re.compile('TxHeader\s+b\sX-Varnish:\s%s' %\
x_varnish_reference[0])
match_object = backend_is_touched_regex.search(varnihslog_data)
backend_touched = match_object is not None
logging.debug('%r %r' % (backend_touched, x_varnish_reference,))
if backend_touched != x_varnish_reference[1]:
if backend_touched:
title = 'Error:Backend touched'
else:
title = 'Error Backend not touched'
message = '%s -> X-Varnish:%r' % (title, x_varnish_reference,)
self.report_list.append(message)
def _parseWgetLine(self, line):
"""return tuple (code, value)
code: 'url' means this is requested url
'status' means we can read status code
'other' something we can discard
"""
match_object = self.url_search_in_wget_regex.search(line)
if match_object is not None:
return self.URL_CODE, match_object.group(self.URL_CODE)
match_object = self.status_search_in_wget_regex.search(line)
if match_object is not None:
return self.STATUS_CODE, match_object.group(self.STATUS_CODE)
match_object = self.file_save_search_regex.search(line)
if match_object is not None:
return self.FILE_PATH_CODE, match_object.group(self.FILE_PATH_CODE)
return self.OTHER_CODE, line
def _runSpider(self):
"""Run wget in working_directory with headers in result
"""
wget_command_string = 'wget -r -nc --retry-connrefused --save-headers %s '\
% self.root_url
logging.debug('wget command:%r' % wget_command_string)
wget_argument_list = shlex.split(wget_command_string)
wget_process = Popen(wget_argument_list, stdin=PIPE,
stdout=PIPE, stderr=STDOUT,
env={'LANG': 'en_EN'}, # Force output messages
# in English
universal_newlines=True,
cwd=self.working_directory) # working directory
stdout, stderr = wget_process.communicate()
return stdout
def _parseWgetLogs(self, wget_log_file, discarded_url_list=_MARKER):
"""read wget logs and test Caching configuration
"""
if discarded_url_list is _MARKER:
first_pass = True
else:
first_pass = False
x_varnish_reference_list = []
for line in wget_log_file.splitlines():
logging.debug('wget output:%r' % line)
code, value = self._parseWgetLine(line)
if code == self.URL_CODE:
# This is the first Line by URL checked in wget stdout
url = value
logging.debug('url:%r' % url)
discarded = False
if not first_pass and url in discarded_url_list:
# URL already checked during first pass
logging.debug('%r Discarded' % url)
discarded = True
if discarded:
# keep reading wget process without doing anything
continue
if code == self.STATUS_CODE:
if value not in self.ACCEPTABLE_STATUS_LIST:
message = 'Page in error:%r status:%r' % (url, value)
if message not in self.report_list:
self.report_list.append(message)
if code == self.FILE_PATH_CODE:
# Here we check if Response was cached
file_path = os.path.sep.join((self.working_directory, value))
file_object = None
try:
file_object = open(file_path, 'r')
except IOError:
# This is probably a folder try with /index.html
index_file_path = file_path + os.path.sep + 'index.html'
try:
file_object = open(index_file_path, 'r')
except IOError:
# sometimes this is appended with .1
number_file_path = file_path + '.1'
try:
file_object = open(number_file_path, 'r')
except IOError:
logging.info('File not found for url:%r %r' %\
(url, (file_path, index_file_path, number_file_path),))
continue
fetched_data = file_object.read()
file_object.close()
x_cache_header_match_object =\
self.x_cache_header_search_regex.search(fetched_data)
if x_cache_header_match_object is None:
# This RESPONSE is not cached by Varnish
message = 'X-Cache header not found for %r' % url
self.report_list.append(message)
else:
# means X-Cache header is present in reponse
# Read the X-Varnish header to know if backend has been touched
x_varnish_match_object =\
self.x_varnish_header_search_regex.search(fetched_data)
x_varnish_reference = x_varnish_match_object.group(1)
logging.info('x_varnish_reference:%r for url:%r' %\
(x_varnish_reference, url))
hits = x_cache_header_match_object.group(1)
if hits.isdigit():
# This is a cached content with a positive hit value
# Check if request didn't goes to the backend
x_varnish_reference_list.append((x_varnish_reference, False, url))
# dot not check this url in second pass
logging.debug('will be discarded:%r' % url)
discarded_url_list.append(url)
else:
x_varnish_reference_list.append((x_varnish_reference, True, url))
for header, reference_value in self.header_list.iteritems():
re_compiled = re.compile(self.generic_header_search_regex % header,
re.MULTILINE)
match_object = re_compiled.search(fetched_data)
if match_object is None:
message = 'header:%r not found for %r' % (header, url)
self.report_list.append(message)
else:
read_value = match_object.group(1)
if reference_value is True and not read_value:
message = 'value of header:%r not found for %r' % (header, url)
self.report_list.append(message)
elif isinstance(reference_value, (tuple,list)):
if read_value not in reference_value:
message = 'value of header:%r does not match'\
' for %r (%r not in %r)' %\
(header, url, read_value, reference_value)
self.report_list.append(message)
return x_varnish_reference_list, discarded_url_list[:]
def start(self):
"""Run test suite
return errors
"""
logging.info('#'*52)
logging.info('date:%r' % (datetime.now().isoformat()))
logging.info('#'*52)
self._initFolder()
logging.info('First pass:%r' % self.root_url)
varnishlog_reading_process = self._runVarnishLog()
wget_log_file = self._runSpider()
varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
x_varnish_reference_list, discarded_url_list =\
self._parseWgetLogs(wget_log_file)
self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
x_varnish_reference_list)
logging.info('End of First pass\n')
[logging.debug(discarded_url) for discarded_url in discarded_url_list]
self._initFolder()
logging.info('Second pass:%r' % self.root_url)
varnishlog_reading_process = self._runVarnishLog()
wget_log_file = self._runSpider()
varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
x_varnish_reference_list, discarded_url_list =\
self._parseWgetLogs(wget_log_file,
discarded_url_list=discarded_url_list)
self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
x_varnish_reference_list)
logging.info('End of second pass\n')
if self.report_list:
report_message = 'Errors:\n\t%s' % '\n\t'.join(self.report_list)
signal = 'PROBLEM'
else:
report_message = 'No errors'
signal = 'OK'
if self.email_address is not None:
import smtplib
subject = 'HTTP Cache checker results for %s: %s' % (self.root_url,
signal)
message = 'Subject: %s\nFrom: %s\nTo: %s\n\n%s' %\
(subject, self.email_address, self.email_address, report_message)
server = smtplib.SMTP(self.smtp_host)
server.sendmail(self.email_address, self.email_address, message)
server.quit()
return 'Email sends to %s' % self.email_address
else:
return report_message
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment