Commit 3c8cda96 authored by Tatuya Kamada's avatar Tatuya Kamada

Import erp5.utils.web_checker from svn.erp5.org:public/erp5/trunk/utils

parents 34ae978b 55ce38e0
......@@ -4,7 +4,10 @@ Changes
0.3 (unreleased)
----------------
* No changes yet
* Imported from https://svn.erp5.org/repos/public/erp5/trunk/utils/
- erp5.util.webchecker:
Utility to check caching policy of websites
0.2 (2011-09-20)
----------------
......
Changelog
=========
0.0.9 (unreleased)
------------------
0.0.8 (2010-10-19)
------------------
- same as 0.0.7
[Nicolas Delaby]
0.0.7 (2010-10-19)
------------------
- Bug fix: typo
0.0.6 (2010-10-19)
------------------
- Bug fix about RawConfigParser
[Nicolas Delaby]
2010-08-13 Nicolas Delaby
-------------------------
small Bug fix.
Add option to export outpout into file
This option take precedence over mailing support.
2010-08-04 Nicolas Delaby
-------------------------
Bug fix in status_search_in_wget_regex
Enhance output report (fetch all errors per url)
Add 2 options to check Urls
prohibited_file_name_list
prohibited_folder_name_list
2010-08-03 Nicolas Delaby
-------------------------
First release
Utility able to call wget and varnishlog to extract Headers and return all failures
according expected caching policy.
This utility is configurable through a configuration file like:
[web_checker]
url = http://www.example.com/
working_directory = /home/me/tmp/crawled_content
varnishlog_binary_path = varnishlog
email_address = me@example.com
smtp_host = localhost
debug_level = debug
[header_list]
Last-Modified = True
Cache-Control = max-age=300
max-age=3600
Vary = Accept-Language, Cookie, Accept-Encoding
Accept-Language, Cookie
Accept-Language,Cookie,Accept-Encoding
Accept-Language,Cookie
Expires = True
[erp5_extension_list]
prohibited_file_name_list = WebSection_viewAsWeb
Base_viewHistory
list
prohibited_folder_name_list = web_page_module
document_module
with
url : website to check
working_directory : fetched data will be downloaded
varnishlog_binary_path : path to varnishlog
email_address : email address to send result
smtp_host : smtp host to use
debug_level : log level of this utility (debug =>very verbose,
info=>normal,
warning=>nothing)
header_list : Key == Header id.
value: if equals to True, it means that header needs to be present in RESPONSE
if it is a tuple, the Header value must sastify at least one of the proposed values
erp5_extension_list: Optional section.
prohibited_file_name_list: which check that any links redirect to prohibited forms
like WebSection_viewAsWeb, Base_viewHistory, list, ...
prohibited_folder_name_list: usefull to detect that links does not redirect to
specified modules like, web_page_module, document_module, ...
This utility requires wget => 1.12
And a callable varnishlog.
The utility must be run on same server where varnish is running.
web_checker reads varnishlogs to detect if a Query goes to the backend.
# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
# Nicolas Delaby <nicolas@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import os
import shutil
import sys
import re
import shlex
from subprocess import Popen, PIPE, STDOUT
import logging
import tempfile
from datetime import datetime
import threading
import signal
_MARKER = []
class ProcessTimeoutException(Exception):
pass
def alarm_handler(signum, frame):
raise ProcessTimeoutException
class HTTPCacheCheckerTestSuite(object):
URL_CODE = 'url'
STATUS_CODE = 'status'
FILE_PATH_CODE = 'path'
OTHER_CODE = 'other'
LOG_LEVEL_DICT = {'debug': logging.DEBUG,
'info': logging.INFO,
'warning': logging.WARNING,
'error': logging.ERROR,
'critical': logging.CRITICAL}
url_search_in_wget_regex = re.compile('^--\d{4}.*--\s+(?P<%s>.*)$' %\
URL_CODE)
status_search_in_wget_regex = re.compile('^HTTP request sent, awaiting '\
'response\.\.\. (?P<%s>\d+).+$' % STATUS_CODE)
file_save_search_regex = re.compile("^Saving to: `(?P<%s>.*)'" %\
FILE_PATH_CODE)
x_cache_header_search_regex = re.compile('X-Cache:\s(\S+)\s?$', re.MULTILINE)
x_varnish_header_search_regex = re.compile('X-Varnish:\s(\d+)', re.MULTILINE)
generic_header_search_regex = '%s:\s(.*)\s$'
ACCEPTABLE_STATUS_LIST = ('200', '304', '302',)
def __init__(self, root_url, working_directory, varnishlog_binary_path,
wget_binary_path, header_list, email_address, smtp_host,
debug_level, file_log_path):
"""
root_url : website to check
working_directory : where fetched data will be downloaded
varnishlog_binary_path : path to varnishlog
wget_binary_path : path to wget command
header_list : Key == Header id.
value: if equals True means header
needs to be present in RESPONSE
if this is tuple, the Header value must sastify
at least one of the proposed values
email_address : email address to send result
smtp_host : smtp host to use
debug_level : log level of this utility (debug =>very verbose,
info=>normal,
warning=> nothing)
file_log_path: path to log file
"""
self.root_url = root_url
self.working_directory = working_directory
self.varnishlog_binary_path = varnishlog_binary_path
self.wget_binary_path = wget_binary_path
self.header_list = header_list
self.email_address = email_address
self.smtp_host = smtp_host
level = self.LOG_LEVEL_DICT.get(debug_level, logging.INFO)
logging.basicConfig(filename=file_log_path, level=level)
self.report_dict = {}
self._timeout = 30
def _initFolder(self):
"""Delete and create workgin directory
"""
if os.path.isdir(self.working_directory):
logging.debug('Re-creating folder:%r' % self.working_directory)
shutil.rmtree(self.working_directory)
os.mkdir(self.working_directory)
if not os.path.isdir(self.working_directory):
logging.debug('Creating folder:%r' % self.working_directory)
os.mkdir(self.working_directory)
def _runVarnishLog(self):
"""Run varnishlog and listen comunications
"""
logging.info('Start varnishlog process')
command_string = '%s -d' % (self.varnishlog_binary_path)#, filename)
command_list = shlex.split(command_string)
process = Popen(command_list, stdin=PIPE, stdout=PIPE)
return process
def _readVarnishLog(self, process):
"""Add timeout support
"""
logging.info('Reading varnishlog with timeout:%r' % self._timeout)
def _kill_process(pid):
# support for python 2.5 and under.
# Shall use process.terminate() for python2.6 and above
os.kill(pid, signal.SIGTERM)
watcher = threading.Timer(self._timeout, _kill_process, args=(process.pid,))
watcher.start()
varnishlog_data, _ = process.communicate()
watcher.cancel()
return varnishlog_data
def _readVarnishLogAndGetIsBackendTouched(self, varnihslog_data,
x_varnish_reference_list):
"""Utility to parse such string:
14 ReqStart c 127.0.0.1 58470 385643239
14 RxRequest c GET
14 RxURL c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
14 RxProtocol c HTTP/1.1
14 RxHeader c Host: 127.0.0.1:6081
14 RxHeader c User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
14 RxHeader c Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
14 RxHeader c Accept-Language: fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3
14 RxHeader c Accept-Encoding: gzip,deflate
14 RxHeader c Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
14 RxHeader c Referer: http://www.example.com/
14 RxHeader c If-Modified-Since: Thu, 25 Feb 2010 16:22:23 GMT
14 RxHeader c Via: 1.1 www.example.com
14 RxHeader c X-Forwarded-For: 82.226.226.226
14 RxHeader c X-Forwarded-Host: www.example.com
14 RxHeader c X-Forwarded-Server: www.example.com
14 RxHeader c Connection: Keep-Alive
14 VCL_call c recv
14 VCL_return c lookup
14 VCL_call c hash
14 VCL_return c hash
14 VCL_call c miss
14 VCL_return c fetch
14 Backend c 15 default default
15 TxRequest b GET
15 TxURL b /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/ga/account/
15 TxProtocol b HTTP/1.1
15 TxHeader b Host: 127.0.0.1:6081
15 TxHeader b User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1.7) Gecko/20100110 Mandriva Linux/1.9.1.7-0.1mdv2010.0 (2010.0) Firefox/3.5.7
15 TxHeader b Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
15 TxHeader b Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
15 TxHeader b Referer: http://www.example.com/
15 TxHeader b Via: 1.1 www.example.com
15 TxHeader b X-Forwarded-For: 82.226.226.226
15 TxHeader b X-Forwarded-Host: www.example.com
15 TxHeader b X-Forwarded-Server: www.example.com
15 TxHeader b X-Varnish: 385643239
15 TxHeader b X-Forwarded-For: 127.0.0.1
16 SessionOpen c 127.0.0.1 58471 :6081
16 ReqStart c 127.0.0.1 58471 385643240
16 RxRequest c GET
16 RxURL c /web/VirtualHostBase/http/www.example.com:80/erp5/web_site_module/example_website/VirtualHostRoot/_vh_/erp5_web_example_layout.css
16 RxProtocol c HTTP/1.1
16 RxHeader c Host: 127.0.0.1:6081
[...]
This script is able to know if the request goes to the backend or not according ID of request
ie: 385643239
if we are able to read such string "TxHeader b X-Varnish: 385643239"
means varnish send a query to the backend.
"""
for x_varnish_reference in x_varnish_reference_list:
backend_is_touched_regex = re.compile('TxHeader\s+b\sX-Varnish:\s%s' %\
x_varnish_reference[0])
match_object = backend_is_touched_regex.search(varnihslog_data)
backend_touched = match_object is not None
logging.debug('%r %r' % (backend_touched, x_varnish_reference,))
if backend_touched != x_varnish_reference[1]:
if backend_touched:
title = 'Error:Backend touched'
else:
title = 'Error Backend not touched'
message = '%s -> X-Varnish:%r' % (title, x_varnish_reference,)
self.report_dict.setdefault(x_varnish_reference[2],
[]).append(message)
def _parseWgetLine(self, line):
"""return tuple (code, value)
code: 'url' means this is requested url
'status' means we can read status code
'other' something we can discard
"""
match_object = self.url_search_in_wget_regex.search(line)
if match_object is not None:
return self.URL_CODE, match_object.group(self.URL_CODE)
match_object = self.status_search_in_wget_regex.search(line)
if match_object is not None:
return self.STATUS_CODE, match_object.group(self.STATUS_CODE)
match_object = self.file_save_search_regex.search(line)
if match_object is not None:
return self.FILE_PATH_CODE, match_object.group(self.FILE_PATH_CODE)
return self.OTHER_CODE, line
def _runSpider(self):
"""Run wget in working_directory with headers in result
"""
wget_command_string = '%s -r -nc --retry-connrefused --save-headers %s '\
% (self.wget_binary_path, self.root_url)
logging.debug('wget command:%r' % wget_command_string)
wget_argument_list = shlex.split(wget_command_string)
wget_process = Popen(wget_argument_list, stdin=PIPE,
stdout=PIPE, stderr=STDOUT,
env={'LANG': 'en_EN'}, # Force output messages
# in English
universal_newlines=True,
cwd=self.working_directory) # working directory
stdout, stderr = wget_process.communicate()
return stdout
def _parseWgetLogs(self, wget_log_file, discarded_url_list=_MARKER,
prohibited_file_name_list=None,
prohibited_folder_name_list=None):
"""read wget logs and test Caching configuration
"""
if discarded_url_list is _MARKER:
first_pass = True
else:
first_pass = False
x_varnish_reference_list = []
for line in wget_log_file.splitlines():
logging.debug('wget output:%r' % line)
code, value = self._parseWgetLine(line)
if code == self.URL_CODE:
# This is the first Line by URL checked in wget stdout
url = value
logging.debug('url:%r' % url)
discarded = False
if not first_pass and url in discarded_url_list:
# URL already checked during first pass
logging.debug('%r Discarded' % url)
discarded = True
if discarded:
# keep reading wget process without doing anything
continue
if code == self.STATUS_CODE:
if value not in self.ACCEPTABLE_STATUS_LIST:
message = 'Page in error status:%r' % (value)
self.report_dict.setdefault(url, []).append(message)
if code == self.FILE_PATH_CODE:
# Here we check if Response was cached
file_path = os.path.sep.join((self.working_directory, value))
folder_path , filename = os.path.split(file_path)
if prohibited_file_name_list:
if '?' in filename:
filename = filename.rpartition('?')[0]
if filename in prohibited_file_name_list:
# Error
message = '%r is prohibited as filename' % filename
self.report_dict.setdefault(url, []).append(message)
if prohibited_folder_name_list:
for prohibited_folder_name in prohibited_folder_name_list:
if prohibited_folder_name in folder_path.split(os.path.sep):
message = '%r is prohibited as folder name' % prohibited_folder_name
self.report_dict.setdefault(url, []).append(message)
break
file_object = None
try:
file_object = open(file_path, 'r')
except IOError:
# This is probably a folder try with /index.html
index_file_path = file_path + os.path.sep + 'index.html'
try:
file_object = open(index_file_path, 'r')
except IOError:
# sometimes this is appended with .1
number_file_path = file_path + '.1'
try:
file_object = open(number_file_path, 'r')
except IOError:
logging.info('File not found for url:%r %r' %\
(url, (file_path, index_file_path, number_file_path),))
continue
fetched_data = file_object.read()
file_object.close()
x_cache_header_match_object =\
self.x_cache_header_search_regex.search(fetched_data)
if x_cache_header_match_object is None:
# This RESPONSE is not cached by Varnish
message = 'X-Cache header not found'
self.report_dict.setdefault(url, []).append(message)
else:
# means X-Cache header is present in reponse
# Read the X-Varnish header to know if backend has been touched
x_varnish_match_object =\
self.x_varnish_header_search_regex.search(fetched_data)
x_varnish_reference = x_varnish_match_object.group(1)
logging.debug('x_varnish_reference:%r for url:%r' %\
(x_varnish_reference, url))
hits = x_cache_header_match_object.group(1)
if hits.isdigit():
# This is a cached content with a positive hit value
# Check if request didn't goes to the backend
x_varnish_reference_list.append((x_varnish_reference, False, url))
# dot not check this url in second pass
logging.debug('will be discarded:%r' % url)
discarded_url_list.append(url)
else:
x_varnish_reference_list.append((x_varnish_reference, True, url))
for header, reference_value in self.header_list.iteritems():
re_compiled = re.compile(self.generic_header_search_regex % header,
re.MULTILINE | re.IGNORECASE)
match_object = re_compiled.search(fetched_data)
if match_object is None:
message = 'header:%r not found' % (header)
self.report_dict.setdefault(url, []).append(message)
else:
read_value = match_object.group(1)
if reference_value is True and not read_value:
message = 'value of header:%r not found' % (header)
self.report_dict.setdefault(url, []).append(message)
elif isinstance(reference_value, (tuple,list)):
if read_value not in reference_value:
message = 'value of header:%r does not match'\
' (%r not in %r)' %\
(header, read_value, reference_value)
self.report_dict.setdefault(url, []).append(message)
return x_varnish_reference_list, discarded_url_list[:]
def start(self, prohibited_file_name_list=None,
prohibited_folder_name_list=None):
"""Run test suite
return errors
"""
logging.info('#'*52)
logging.info('date:%r' % (datetime.now().isoformat()))
logging.info('#'*52)
self._initFolder()
logging.info('First pass:%r' % self.root_url)
varnishlog_reading_process = self._runVarnishLog()
wget_log_file = self._runSpider()
varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
x_varnish_reference_list, discarded_url_list =\
self._parseWgetLogs(wget_log_file)
self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
x_varnish_reference_list)
logging.info('End of First pass\n')
[logging.debug(discarded_url) for discarded_url in discarded_url_list]
self._initFolder()
logging.info('Second pass:%r' % self.root_url)
varnishlog_reading_process = self._runVarnishLog()
wget_log_file = self._runSpider()
varnishlog_data = self._readVarnishLog(varnishlog_reading_process)
x_varnish_reference_list, discarded_url_list =\
self._parseWgetLogs(wget_log_file,
discarded_url_list=discarded_url_list,
prohibited_file_name_list=prohibited_file_name_list,
prohibited_folder_name_list=prohibited_folder_name_list)
self._readVarnishLogAndGetIsBackendTouched(varnishlog_data,
x_varnish_reference_list)
logging.info('End of second pass\n')
if self.report_dict:
report_message_list = ['*Errors*:']
for url, message_list in self.report_dict.iteritems():
unique_message_list = []
for message in message_list:
if message not in unique_message_list:
unique_message_list.append(message)
report_message_list.append('\n')
report_message_list.append(url)
report_message_list.extend(['\t%s' % message for message\
in unique_message_list])
report_message = '\n'.join(report_message_list)
signal = 'PROBLEM'
else:
report_message = 'No errors'
signal = 'OK'
subject = '%r:HTTP Cache checker results for %s' % (signal, self.root_url)
if self.email_address:
import smtplib
message = 'Subject: %s\nFrom: %s\nTo: %s\n\n%s' %\
(subject, self.email_address, self.email_address, report_message)
server = smtplib.SMTP(self.smtp_host)
server.sendmail(self.email_address, self.email_address, message)
server.quit()
return 'Email sended to %s' % self.email_address
else:
return subject + '\n' + report_message
from optparse import OptionParser
import ConfigParser
def web_checker_utility():
usage = "usage: %prog [options] config_path"
parser = OptionParser(usage=usage)
parser.add_option('-o', '--output_file',
dest='output_file')
(options, args) = parser.parse_args()
if len(args) != 1 :
print parser.print_help()
parser.error('incorrect number of arguments')
config_path = args[0]
config = ConfigParser.RawConfigParser(defaults=dict(wget_binary_path='wget',
file_log_path='web_checker.log'))
config.read(config_path)
working_directory = config.get('web_checker', 'working_directory')
url = config.get('web_checker', 'url')
varnishlog_binary_path = config.get('web_checker', 'varnishlog_binary_path')
wget_binary_path = config.get('web_checker', 'wget_binary_path')
email_address = config.get('web_checker', 'email_address')
smtp_host = config.get('web_checker', 'smtp_host')
debug_level = config.get('web_checker', 'debug_level')
file_log_path = config.get('web_checker', 'file_log_path')
header_list = {}
for header, configuration in config.items('header_list'):
if header in config.defaults().keys():
# defaults are shared for all sections.
# so discard them from header_list
continue
if configuration in ('True', 'true', 'yes'):
value = True
else:
value = configuration.splitlines()
header_list[header] = value
if config.has_section('erp5_extension_list'):
prohibited_file_name_list = config.get('erp5_extension_list',
'prohibited_file_name_list').splitlines()
prohibited_folder_name_list = config.get('erp5_extension_list',
'prohibited_folder_name_list').splitlines()
else:
prohibited_file_name_list = prohibited_folder_name_list = []
instance = HTTPCacheCheckerTestSuite(url,
working_directory,
varnishlog_binary_path,
wget_binary_path,
header_list,
email_address,
smtp_host,
debug_level,
file_log_path)
result = instance.start(prohibited_file_name_list=prohibited_file_name_list,
prohibited_folder_name_list=prohibited_folder_name_list)
if options.output_file:
file_object = open(options.output_file, 'w')
file_object.write(result)
file_object.close()
else:
print result
......@@ -61,6 +61,7 @@ setup(name=name,
'performance_tester_erp5 = erp5.util.benchmark.performance_tester:main [benchmark]',
'scalability_tester_erp5 = erp5.util.benchmark.scalability_tester:main [scalability_tester]',
'generate_erp5_tester_report = erp5.util.benchmark.report:generateReport [benchmark-report]',
'web_checker_utility = erp5.util.webchecker:web_checker_utility [webchecker]'
],
}
)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment