Commit 284fb88f authored by Romain Courteaud

bot: report status with potential issue

* reduce information density
* skip urls which are not meant to be checked
* warn 2 weeks before ssl certificate expiration
* skip ssl/server warnings for non-critical urls
parent 9223cbe2
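
Two numbers drive the new filtering. Written out as a small illustrative snippet (the values are taken from the diff below; the snippet itself is not part of the commit):

    import datetime

    # ssl certificates are reported starting 2 weeks before they expire
    ssl_warning_window = 60 * 60 * 24 * 14                   # seconds
    print(datetime.timedelta(seconds=ssl_warning_window))    # 14 days, 0:00:00

    # bot_status entries younger than twice the check interval are hidden,
    # and INTERVAL values below 60 seconds are clamped to 60
    interval = 30
    if interval < 60:
        interval = 60
    print(2 * interval)                                      # 120 seconds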
@@ -34,6 +34,7 @@ import email.utils
 from collections import OrderedDict
 from .ssl import hasValidSSLCertificate, reportSslCertificate
 import datetime
+from email.utils import parsedate_to_datetime

 __version__ = "0.5.0"
@@ -47,6 +48,94 @@ def rfc822(date):
     return email.utils.format_datetime(date)


+def filterWarningStatus(status_dict, interval, not_critical_url_list):
+    now = datetime.datetime.utcnow()
+    if interval < 60:
+        interval = 60
+    for i in range(len(status_dict["bot_status"]) - 1, -1, -1):
+        status_date = parsedate_to_datetime(
+            status_dict["bot_status"][i]["date"]
+        )
+        if (now - status_date).total_seconds() < (2 * interval):
+            # Skip the bot status if it was recently triggered
+            del status_dict["bot_status"][i]
+    if not status_dict["bot_status"]:
+        del status_dict["bot_status"]
+
+    for i in range(len(status_dict["dns_server"]) - 1, -1, -1):
+        state = status_dict["dns_server"][i]["state"]
+        if state == "open":
+            del status_dict["dns_server"][i]
+    if not status_dict["dns_server"]:
+        del status_dict["dns_server"]
+
+    for i in range(len(status_dict["dns_query"]) - 1, -1, -1):
+        state = status_dict["dns_query"][i]["response"]
+        if state != "":
+            del status_dict["dns_query"][i]
+    if not status_dict["dns_query"]:
+        del status_dict["dns_query"]
+
+    if not status_dict["missing_data"]:
+        del status_dict["missing_data"]
+
+    for i in range(len(status_dict["http_server"]) - 1, -1, -1):
+        state = status_dict["http_server"][i]["state"]
+        # Skip if all domains lead to not critical urls
+        prefix = ""
+        if status_dict["http_server"][i]["port"] == 80:
+            prefix = "http://"
+        elif status_dict["http_server"][i]["port"] == 443:
+            prefix = "https://"
+        domain_list = status_dict["http_server"][i]["domain"].split(", ")
+        domain_list = [
+            x
+            for x in domain_list
+            if "%s%s" % (prefix, x) not in not_critical_url_list
+        ]
+        if (state == "open") or (not domain_list):
+            del status_dict["http_server"][i]
+    if not status_dict["http_server"]:
+        del status_dict["http_server"]
+
+    for i in range(len(status_dict["ssl_certificate"]) - 1, -1, -1):
+        not_after = status_dict["ssl_certificate"][i]["not_after"]
+        if (
+            (not_after is not None)
+            and (
+                (60 * 60 * 24 * 14)
+                < (parsedate_to_datetime(not_after) - now).total_seconds()
+            )
+        ) or (
+            ("https://%s" % status_dict["ssl_certificate"][i]["hostname"])
+            in not_critical_url_list
+        ):
+            # Warn 2 weeks before expiration
+            # Skip if we check only the http url
+            del status_dict["ssl_certificate"][i]
+        else:
+            # Drop columns with too much info
+            del status_dict["ssl_certificate"][i]["not_before"]
+            del status_dict["ssl_certificate"][i]["issuer"]
+            del status_dict["ssl_certificate"][i]["sha1_fingerprint"]
+            del status_dict["ssl_certificate"][i]["subject"]
+    if not status_dict["ssl_certificate"]:
+        del status_dict["ssl_certificate"]
+
+    for i in range(len(status_dict["http_query"]) - 1, -1, -1):
+        http_code = status_dict["http_query"][i]["status_code"]
+        if (http_code != 404) and (http_code < 500):
+            del status_dict["http_query"][i]
+        elif status_dict["http_query"][i]["url"] in not_critical_url_list:
+            del status_dict["http_query"][i]
+        else:
+            # Drop columns with too much info
+            del status_dict["http_query"][i]["http_header_dict"]
+            del status_dict["http_query"][i]["total_seconds"]
+    if not status_dict["http_query"]:
+        del status_dict["http_query"]
+
+
 class WebBot:
     def __init__(self, **kw):
         self.config_kw = kw
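
# ---------------------------------------------------------------------------
# Illustration (not part of the diff): what filterWarningStatus does to a
# hand-built status_dict. Field values are invented; real dicts come from
# WebBot.status(), and filterWarningStatus is assumed to be in scope.
# Healthy entries disappear; problems stay, with verbose columns stripped.
# ---------------------------------------------------------------------------
import datetime
import email.utils

now = datetime.datetime.utcnow()
rfc = email.utils.format_datetime
sample = {
    "bot_status": [{"date": rfc(now)}],             # recent -> hidden
    "dns_server": [{"state": "open"}],              # reachable -> hidden
    "dns_query": [{"response": "127.0.0.1"}],       # resolved -> hidden
    "missing_data": [],                             # empty -> hidden
    "http_server": [
        # port closed -> kept
        {"state": "closed", "port": 443, "domain": "example.org"}
    ],
    "ssl_certificate": [
        {   # expires in 5 days (< 2 weeks) -> kept, verbose columns dropped
            "hostname": "example.org",
            "not_after": rfc(now + datetime.timedelta(days=5)),
            "not_before": "...", "issuer": "...",
            "sha1_fingerprint": "...", "subject": "...",
        }
    ],
    "http_query": [
        {   # 5xx answer -> kept, verbose columns dropped
            "url": "https://example.org/", "status_code": 503,
            "http_header_dict": {}, "total_seconds": 1.2,
        }
    ],
}
filterWarningStatus(sample, 300, [])
print(sorted(sample))  # ['http_query', 'http_server', 'ssl_certificate']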
@@ -81,6 +170,22 @@ class WebBot:
             public_suffix_list=self.config["PUBLIC_SUFFIX"].split(),
         )

+    def calculateNotCriticalUrlList(self):
+        domain_list = self.config["DOMAIN"].split()
+        url_list = self.config["URL"].split()
+        not_critical_url_list = []
+        for url in url_list:
+            hostname = getUrlHostname(url)
+            if hostname is not None:
+                if hostname not in domain_list:
+                    # Domain not explicitly checked
+                    # Skip both root urls
+                    for protocol in ("http", "https"):
+                        not_critical_url = "%s://%s" % (protocol, hostname)
+                        if not_critical_url not in url_list:
+                            not_critical_url_list.append(not_critical_url)
+        return not_critical_url_list
+
     def iterateLoop(self):
         status_id = logStatus(self._db, "loop")
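
# ---------------------------------------------------------------------------
# Illustration (not part of the diff): the classification performed by
# calculateNotCriticalUrlList, restated as a standalone function so the
# expected output is easy to check. urlsplit stands in for getUrlHostname,
# whose definition is not part of this hunk.
# ---------------------------------------------------------------------------
from urllib.parse import urlsplit


def classify_not_critical(domain_config, url_config):
    domain_list = domain_config.split()
    url_list = url_config.split()
    not_critical_url_list = []
    for url in url_list:
        hostname = urlsplit(url).hostname
        if hostname is not None and hostname not in domain_list:
            # The hostname is only reached through explicit URLs, so its bare
            # root URLs are not critical unless they are checked themselves.
            for protocol in ("http", "https"):
                root_url = "%s://%s" % (protocol, hostname)
                if root_url not in url_list:
                    not_critical_url_list.append(root_url)
    return not_critical_url_list


print(classify_not_critical(
    "example.org",
    "https://example.org/ https://blog.example.org/page",
))
# ['http://blog.example.org', 'https://blog.example.org']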
@@ -362,19 +467,22 @@ class WebBot:
     def run(self, mode):
         status_dict = None
-        if mode not in ["crawl", "status"]:
+        if mode not in ["crawl", "status", "warning"]:
             raise NotImplementedError("Unexpected mode: %s" % mode)

         if self.config["SQLITE"] == ":memory:":
             # Crawl/report are mandatory when using memory
-            mode = "all"
+            if mode == "warning":
+                mode = "wallwarning"
+            else:
+                mode = "all"

         self.initDB()
         try:
-            if mode in ["crawl", "all"]:
+            if mode in ["crawl", "wallwarning", "all"]:
                 self.crawl()
-            if mode in ["status", "all"]:
+            if mode in ["status", "all", "wallwarning", "warning"]:
                 status_dict = self.status()
         except:
             self.closeDB()
@@ -383,6 +491,12 @@
         self.closeDB()

         if status_dict is not None:
+            if mode in ("wallwarning", "warning"):
+                filterWarningStatus(
+                    status_dict,
+                    int(self.config.get("INTERVAL")),
+                    self.calculateNotCriticalUrlList(),
+                )
             if self.config["FORMAT"] == "json":
                 print(json.dumps(status_dict))
             else:
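
# ---------------------------------------------------------------------------
# Note (not part of the diff): which steps run() performs per mode, as read
# from the two hunks above. "wallwarning" is the internal name "warning"
# gets when the sqlite DB is ":memory:", so that a crawl happens first.
#
#   mode            crawl()   status()   filterWarningStatus()
#   "crawl"           yes        no               no
#   "status"          no         yes              no
#   "warning"         no         yes              yes
#   "all"             yes        yes              no
#   "wallwarning"     yes        yes              yes
# ---------------------------------------------------------------------------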
@@ -29,7 +29,7 @@ from .bot import create_bot
     help="The bot operation mode to run.",
     show_default=True,
     default="status",
-    type=click.Choice(["crawl", "status"]),
+    type=click.Choice(["crawl", "status", "warning"]),
 )
 @click.option(
     "--sqlite", "-s", help="The path of the sqlite DB. (default: :memory:)"