Commit d4fc7062 authored by Romain Courteaud's avatar Romain Courteaud

Surykatka: first release

Check http status
parents
# URL checker contribution
## Install development environment
```
python3 -m venv venv
. venv/bin/activate
pip install --upgrade --editable ".[dev]"
py.test
surykatka -u URL
```
## Check the code
```
pyflakes src/surykatka/*py tests/*py setup.py
black -t py37 -l 79 src/surykatka/*py tests/*py setup.py
```
## SNI Support
https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484
\ No newline at end of file
This diff is collapsed.
# surykatka
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import io
import re
from setuptools import setup, find_packages
# Read the package version straight from the bot module so that it is
# declared in a single place.
with io.open("src/surykatka/bot.py", "rt", encoding="utf8") as f:
    version_match = re.search(r'__version__ = "(.*?)"', f.read())

if version_match is None:
    # Fail with an explicit message instead of an AttributeError on None
    raise RuntimeError("Unable to find __version__ in src/surykatka/bot.py")
version = version_match.group(1)

setup(
    name="surykatka",
    version=version,
    license="GPLv3+",
    author="Nexedi",
    author_email="romain@nexedi.com",
    # NOTE(review): __doc__ is None unless this script gets a module
    # docstring - confirm whether a README should be used instead.
    long_description=__doc__,
    packages=find_packages("src"),
    package_dir={"": "src"},
    include_package_data=False,
    zip_safe=True,
    python_requires=">=3.5",
    install_requires=[
        "setuptools>40.5.0",
        "requests>2.20.0",
        "forcediphttpsadapter",
        "peewee>2.10.1",
        "click>=7.0",
        "dnspython",
        "miniupnpc",
    ],
    extras_require={
        "dev": ["pytest", "black", "pyflakes", "mock", "httpretty"]
    },
    entry_points={
        # No trailing whitespace in the "name=module:attr" specification
        "console_scripts": ["surykatka=surykatka.cli:runSurykatka"]
    },
)
This diff is collapsed.
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import click
import sys
from .bot import create_bot
@click.command(short_help="Runs surykatka bot.")
@click.option(
    "--run",
    "-r",
    help="The bot operation mode to run.",
    show_default=True,
    default="status",
    type=click.Choice(["crawl", "status"]),
)
@click.option(
    "--sqlite", "-s", help="The path of the sqlite DB. (default: :memory:)"
)
@click.option("--nameserver", "-n", help="The IP of the DNS server.")
@click.option("--url", "-u", help="The url to check.")
@click.option("--domain", "-d", help="The domain to check.")
@click.option("--timeout", "-t", help="The timeout value.")
@click.option(
    "--configuration", "-f", help="The path of the configuration file."
)
@click.option(
    "--reload/--no-reload",
    default=False,
    help="Reload the configuration file between each crawl.",
    show_default=True,
)
@click.option(
    "--output",
    "-o",
    help="The status output format.",
    type=click.Choice(["plain", "json"]),
    default="plain",
    show_default=True,
)
def runSurykatka(
    run,
    sqlite,
    nameserver,
    url,
    domain,
    timeout,
    configuration,
    reload,
    output,
):
    # Command line entry point: translate the options into a configuration
    # mapping, build the bot from it and run the requested mode.
    # (Kept as comments rather than a docstring so that the text displayed
    # by --help is not modified.)
    mapping = {}
    # Giving either --url or --domain on the command line resets the other
    # one to an empty value, unless both are provided.
    if url:
        mapping["URL"] = url
        mapping["DOMAIN"] = ""
    if domain:
        mapping["DOMAIN"] = domain
        if not url:
            mapping["URL"] = ""
    if sqlite:
        mapping["SQLITE"] = sqlite
    if nameserver:
        mapping["NAMESERVER"] = nameserver
    if timeout:
        # Forward --timeout, which was previously parsed but never used
        mapping["TIMEOUT"] = timeout
    if reload:
        mapping["RELOAD"] = str(reload)
    # The output format always comes from the command line (default: plain)
    mapping["FORMAT"] = output
    bot = create_bot(cfgfile=configuration, mapping=mapping)
    return bot.run(run)
# Allow running this module directly as a script
if __name__ == "__main__":
    exit_status = runSurykatka()
    sys.exit(exit_status)
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import configparser
import os
from dns.resolver import get_default_resolver
# Name of the only configuration section read and written by this module
CONFIG_SECTION = "SURYKATKA"


def createConfiguration(
    envvar="SURYKATKA_SETTINGS", cfgfile=None, mapping=None
):
    """Build the bot configuration section.

    Values are layered in this order, later sources overriding earlier
    ones: built-in defaults, the file named by the ``envvar`` environment
    variable, ``cfgfile``, then ``mapping``. Missing required keys are
    filled in afterwards.

    :param envvar: name of an environment variable holding a configuration
        file path, or None to skip it.
    :param cfgfile: path of a configuration file, or None.
    :param mapping: dict of values stored into the section, or None.
    :return: the ``CONFIG_SECTION`` section of the parser.
    """
    parser = configparser.ConfigParser(empty_lines_in_values=False)

    # Built-in defaults
    parser[CONFIG_SECTION] = {"INTERVAL": -1, "DOMAIN": "", "URL": ""}

    # User defined values
    if envvar is not None and envvar in os.environ:
        parser.read([os.environ.get(envvar)])
    if cfgfile is not None:
        parser.read([cfgfile])
    if mapping is not None:
        parser.read_dict({CONFIG_SECTION: mapping})

    section = parser[CONFIG_SECTION]

    # Required values
    if "SQLITE" not in section:
        section["SQLITE"] = ":memory:"
    if "NAMESERVER" not in section:
        # Only look up the default resolver when no nameserver was given
        section["NAMESERVER"] = "\n".join(get_default_resolver().nameservers)
    fallback_list = (
        ("DOMAIN", ""),
        ("URL", ""),
        ("FORMAT", "json"),
        ("TIMEOUT", "1"),
        ("RELOAD", str(False)),
    )
    for key, fallback in fallback_list:
        if key not in section:
            section[key] = fallback

    if section["SQLITE"] == ":memory:":
        # Do not loop when using temporary DB
        section["INTERVAL"] = "-1"

    return section
def logConfiguration(db, status_id, config):
    """Record every configuration parameter whose value has changed.

    For each ``(key, value)`` of ``config``, the most recent
    ``ConfigurationChange`` row of that parameter (ordered by status,
    descending) is compared with the value; a new row bound to
    ``status_id`` is created only when they differ or when no row exists.
    All the work is done inside one ``db._db.atomic()`` block.
    """
    change_table = db.ConfigurationChange
    with db._db.atomic():
        for parameter, current_value in config.items():
            latest_change_query = (
                change_table.select()
                .where(change_table.parameter == parameter)
                .order_by(change_table.status.desc())
            )
            try:
                stored_value = latest_change_query.get().value
            except change_table.DoesNotExist:
                # Parameter never logged before
                stored_value = None

            if stored_value == current_value:
                continue
            change_table.create(
                status=status_id, parameter=parameter, value=current_value
            )
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import peewee
from playhouse.migrate import migrate, SqliteMigrator
from playhouse.sqlite_ext import SqliteExtDatabase
import datetime
class LogDB:
    """SQLite backed storage of the bot status and of the observed changes.

    The peewee models are declared inside ``__init__`` because they must be
    bound to the database instance opened for ``sqlite_path``; they are then
    exposed as attributes (``Status``, ``ConfigurationChange``, ...).
    """

    def __init__(self, sqlite_path):
        """Open (and connect to) the sqlite database at ``sqlite_path``."""
        self._db = SqliteExtDatabase(
            sqlite_path, pragmas=(("journal_mode", "WAL"), ("foreign_keys", 1))
        )
        self._db.connect()

        class BaseModel(peewee.Model):
            class Meta:
                database = self._db

        # This store the start, stop, loop time of the bot
        # All other tables point to it to be able to group some info
        class Status(BaseModel):
            text = peewee.TextField()
            timestamp = peewee.TimestampField(
                primary_key=True,
                # Store microsecond resolution (10 ** 6 ticks per second)
                resolution=6,
                # date is in UTC
                utc=True,
                # With utc=True, naive datetimes are interpreted as UTC:
                # the default must be the UTC time, not the local time.
                default=datetime.datetime.utcnow,
            )

        # Store the configuration modification
        class ConfigurationChange(BaseModel):
            status = peewee.ForeignKeyField(Status)
            parameter = peewee.TextField(index=True)
            value = peewee.TextField()

            class Meta:
                primary_key = peewee.CompositeKey("status", "parameter")
                # indexes = (
                #  create a unique on from/to/date
                # (('status', 'parameter'), True),
                # )

        # Store the platform modification
        class PlatformChange(BaseModel):
            status = peewee.ForeignKeyField(Status)
            parameter = peewee.TextField(index=True)
            value = peewee.TextField()

            class Meta:
                primary_key = peewee.CompositeKey("status", "parameter")

        # Store remote network status
        class NetworkChange(BaseModel):
            status = peewee.ForeignKeyField(Status)
            ip = peewee.TextField()
            transport = peewee.TextField()
            port = peewee.IntegerField()
            state = peewee.TextField()

            class Meta:
                primary_key = peewee.CompositeKey(
                    "status", "ip", "transport", "port"
                )

        # Store the DNS responses
        class DnsChange(BaseModel):
            status = peewee.ForeignKeyField(Status)
            resolver_ip = peewee.TextField()
            domain = peewee.TextField()
            rdtype = peewee.TextField()
            response = peewee.TextField()

            class Meta:
                primary_key = peewee.CompositeKey(
                    "status", "resolver_ip", "domain", "rdtype"
                )

        # Store the HTTP status codes
        class HttpCodeChange(BaseModel):
            status = peewee.ForeignKeyField(Status)
            ip = peewee.TextField()
            url = peewee.TextField()
            status_code = peewee.IntegerField()

            class Meta:
                primary_key = peewee.CompositeKey("status", "ip", "url")

        self.Status = Status
        self.ConfigurationChange = ConfigurationChange
        self.PlatformChange = PlatformChange
        self.NetworkChange = NetworkChange
        self.DnsChange = DnsChange
        self.HttpCodeChange = HttpCodeChange

    def createTables(self):
        """Create the tables of a new database, or migrate an existing one.

        The schema version is tracked with the sqlite ``user_version``
        pragma: 0 means that the database was never initialised.
        """
        # http://www.sqlite.org/pragma.html#pragma_user_version
        db_version = self._db.pragma("user_version")
        expected_version = 1
        if db_version == 0:
            with self._db.transaction():
                self._db.create_tables(
                    [
                        self.Status,
                        self.ConfigurationChange,
                        self.HttpCodeChange,
                        self.NetworkChange,
                        self.PlatformChange,
                        self.DnsChange,
                    ]
                )
                self._db.pragma("user_version", expected_version)
        elif db_version != expected_version:
            # No migration exists yet: both lists are empty placeholders
            # migrator = SqliteMigrator(self._db)
            migration_list = []
            sql_query_list = []
            if migration_list or sql_query_list:
                with self._db.transaction():
                    if migration_list:
                        migrate(*migration_list)
                    if sql_query_list:
                        for sql_query in sql_query_list:
                            # NOTE(review): confirm that the installed
                            # peewee still accepts require_commit before
                            # adding the first raw SQL migration.
                            self._db.execute_sql(
                                sql_query, require_commit=False
                            )
                    self._db.pragma("user_version", expected_version)

    def close(self):
        """Close the database connection."""
        self._db.close()
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import dns
from .network import logNetwork
from peewee import fn
# Domain queried by getReachableResolverList to probe each DNS server
URL_TO_CHECK = "example.org"
# Default value of the `timeout` parameter of the DNS helpers below;
# buildResolver applies it to both the resolver timeout and lifetime.
TIMEOUT = 2
def reportDnsQuery(db, resolver_ip=None, domain=None, rdtype=None):
    """Build the query returning the latest logged DNS responses.

    Rows of ``db.DnsChange`` are grouped by (resolver_ip, domain, rdtype)
    and only the row holding the highest ``status_id`` of each group is
    kept.

    Each optional filter accepts either a single value (equality match) or
    a list of values (membership match); None disables the filter.

    :return: the (not yet executed) peewee query.
    """
    query = (
        db.DnsChange.select(db.DnsChange)
        .group_by(
            db.DnsChange.resolver_ip, db.DnsChange.domain, db.DnsChange.rdtype
        )
        .having(db.DnsChange.status_id == fn.MAX(db.DnsChange.status_id))
    )
    filter_list = (
        (db.DnsChange.resolver_ip, resolver_ip),
        (db.DnsChange.domain, domain),
        (db.DnsChange.rdtype, rdtype),
    )
    for field, expected in filter_list:
        if expected is None:
            continue
        if isinstance(expected, list):
            # `<<` is the peewee IN operator
            query = query.where(field << expected)
        else:
            query = query.where(field == expected)
    return query
def logDnsQuery(db, status_id, resolver_ip, domain_text, rdtype, answer_list):
    """Log a DNS response if it differs from the latest one stored.

    ``answer_list`` is sorted in place, then joined with ", " to build the
    stored response text. A new ``DnsChange`` row bound to ``status_id`` is
    created only when no previous row exists or when its response differs.

    :return: the ``status_id`` of the row describing the current response
        (either the existing one or the newly created one).
    """
    # In place sort: the caller's list is reordered too
    answer_list.sort()
    response = ", ".join(answer_list)

    with db._db.atomic():
        latest_query = reportDnsQuery(
            db, resolver_ip=resolver_ip, domain=domain_text, rdtype=rdtype
        )
        try:
            entry = latest_query.get()
        except db.DnsChange.DoesNotExist:
            entry = None

        unchanged = (entry is not None) and (entry.response == response)
        if not unchanged:
            entry = db.DnsChange.create(
                resolver_ip=resolver_ip,
                domain=domain_text,
                rdtype=rdtype,
                response=response,
                status=status_id,
            )
    return entry.status_id
def buildResolver(resolver_ip, timeout):
    """Return a dnspython resolver which only uses ``resolver_ip``.

    The resolver is created with ``configure=False``; ``timeout`` is
    applied to both its ``timeout`` and ``lifetime`` attributes, and its
    ``edns`` attribute is set to -1.
    """
    dedicated_resolver = dns.resolver.Resolver(configure=False)
    dedicated_resolver.nameservers.append(resolver_ip)
    dedicated_resolver.timeout = dedicated_resolver.lifetime = timeout
    dedicated_resolver.edns = -1
    return dedicated_resolver
def queryDNS(db, status_id, resolver_ip, domain_text, rdtype, timeout=TIMEOUT):
    """Resolve ``domain_text`` with ``resolver_ip`` and log the response.

    Only ``rdtype == "A"`` is accepted. Resolution failures (NXDOMAIN,
    NoAnswer, Timeout, NoNameservers) are reported as an empty answer.

    :return: the list of resolved addresses (sorted as a side effect of
        logDnsQuery).
    """
    # only A (and AAAA) has address property
    assert rdtype == "A"

    resolver = buildResolver(resolver_ip, timeout)
    handled_error_list = (
        dns.resolver.NXDOMAIN,
        dns.resolver.NoAnswer,
        dns.exception.Timeout,
        dns.resolver.NoNameservers,
    )
    answer_list = []
    try:
        for record in resolver.query(
            domain_text, rdtype, raise_on_no_answer=False
        ):
            answer_list.append(record.address)
    except handled_error_list:
        # Drop any partial result
        answer_list = []

    logDnsQuery(db, status_id, resolver_ip, domain_text, rdtype, answer_list)
    return answer_list
def getReachableResolverList(db, status_id, resolver_ip_list, timeout=TIMEOUT):
    """Return the DNS servers of ``resolver_ip_list`` which answer a query.

    Every server is asked once for the A record of ``URL_TO_CHECK``, to
    prevent using it later if it is down. Its state ("open" when at least
    one address was returned, "closed" otherwise) is logged with
    logNetwork as UDP port 53.
    """
    reachable_ip_list = []
    for resolver_ip in resolver_ip_list:
        answer_list = queryDNS(
            db, status_id, resolver_ip, URL_TO_CHECK, "A", timeout
        )
        if answer_list:
            resolver_state = "open"
            reachable_ip_list.append(resolver_ip)
        else:
            # We expect a valid response
            # Drop the DNS server...
            resolver_state = "closed"
        logNetwork(db, resolver_ip, "UDP", 53, resolver_state, status_id)
    return reachable_ip_list
def expandDomainList(domain_list):
    """Return the domains of ``domain_list`` together with their parents.

    For every name whose ``labels`` count, minus one, is greater than 2,
    the parent domain is added as well, and is itself expanded the same
    way.

    The argument is left untouched (it used to be extended in place while
    being iterated); any iterable of strings is accepted.

    :return: a new sorted list without duplicates.
    """
    expanded_domain_set = set()
    pending_list = list(domain_list)
    while pending_list:
        domain_text = pending_list.pop()
        if domain_text in expanded_domain_set:
            # Already handled, and so were its parents
            continue
        expanded_domain_set.add(domain_text)
        dns_name = dns.name.from_text(domain_text)
        if (len(dns_name.labels) - 1) > 2:
            pending_list.append(
                dns_name.parent().to_text(omit_final_dot=True)
            )
    return sorted(expanded_domain_set)
def getDomainIpDict(
    db, status_id, resolver_ip_list, domain_list, rdtype, timeout=TIMEOUT
):
    """Resolve every domain with every resolver and group domains by IP.

    :return: a dict mapping each resolved address to the list of domains
        (without duplicates, in discovery order) which resolved to it.
    """
    server_ip_dict = {}
    for domain_text in domain_list:
        for resolver_ip in resolver_ip_list:
            for address in queryDNS(
                db, status_id, resolver_ip, domain_text, rdtype, timeout
            ):
                known_domain_list = server_ip_dict.setdefault(address, [])
                # Do not duplicate the domain
                if domain_text not in known_domain_list:
                    known_domain_list.append(domain_text)
    return server_ip_dict
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
import requests
from urllib.parse import urlparse, urlunsplit
from forcediphttpsadapter.adapters import ForcedIPHTTPSAdapter
from peewee import fn
# Media type given the highest weight in the default Accept header built
# by request()
PREFERRED_TYPE = "text/html"
# Default value of the `timeout` parameter of request() and
# checkHttpStatus()
TIMEOUT = 2
def getUrlHostname(url):
    """Return the hostname component of ``url`` as parsed by urlparse."""
    parsed_url = urlparse(url)
    return parsed_url.hostname
def getRootUrl(url):
    """Return ``scheme://hostname`` for ``url``.

    Only the scheme and the hostname are kept: path, query and port are
    not part of the result.
    """
    url_parts = urlparse(url)
    scheme, hostname = url_parts.scheme, url_parts.hostname
    return "%s://%s" % (scheme, hostname)
def getUserAgent(version):
    """Return the User-Agent header value advertising bot ``version``."""
    product = "SURYKATKA"
    homepage = "https://lab.nexedi.com/nexedi/surykatka"
    return "%s/%s (+%s)" % (product, version, homepage)
def _buildErrorResponse(status_code):
    # Build an empty response carrying only a synthetic status code
    response = requests.models.Response()
    response.status_code = status_code
    return response


def request(url, timeout=TIMEOUT, headers=None, session=requests, version=0):
    """Send a GET request to ``url`` and always return a response object.

    Redirects are not followed and certificates are verified. Network
    failures are not raised: they are converted into an empty response
    with a synthetic status code (NOTE(review): these look like the
    unofficial Cloudflare 52x codes - confirm):

    - 526: SSL error
    - 523: connection error
    - 524: timeout
    - 520: too many redirects

    :param url: the URL to fetch.
    :param timeout: timeout forwarded to ``session.request``.
    :param headers: optional headers; default ``Accept`` and
        ``User-Agent`` values are added when missing. The given dict is
        not modified.
    :param session: object providing ``request(method, url, **kw)``
        (the ``requests`` module itself by default).
    :param version: bot version advertised in the default User-Agent.
    """
    # Work on a copy so that the caller's dict is never modified
    headers = {} if headers is None else dict(headers)
    if "Accept" not in headers:
        headers["Accept"] = "%s;q=0.9,*/*;q=0.8" % PREFERRED_TYPE
    if "User-Agent" not in headers:
        # XXX user agent
        headers["User-Agent"] = getUserAgent(version)

    kwargs = {
        "stream": False,
        "timeout": timeout,
        "allow_redirects": False,
        "verify": True,
        "headers": headers,
    }

    try:
        return session.request("GET", url, **kwargs)
    except requests.exceptions.SSLError:
        # Must be handled before ConnectionError, which is its base class
        # XXX Enter into unknown host
        return _buildErrorResponse(526)
    except requests.exceptions.ConnectionError:
        return _buildErrorResponse(523)
    except requests.exceptions.Timeout:
        return _buildErrorResponse(524)
    except requests.exceptions.TooManyRedirects:
        return _buildErrorResponse(520)
def reportHttp(db, ip=None, url=None):
    """Build the query returning the latest logged HTTP status codes.

    Rows of ``db.HttpCodeChange`` are grouped by (ip, url) and only the
    row holding the highest ``status_id`` of each group is kept.

    Each optional filter accepts either a single value (equality match) or
    a list of values (membership match); None disables the filter.

    :return: the (not yet executed) peewee query.
    """
    query = (
        db.HttpCodeChange.select(db.HttpCodeChange)
        .group_by(db.HttpCodeChange.ip, db.HttpCodeChange.url)
        .having(
            db.HttpCodeChange.status_id == fn.MAX(db.HttpCodeChange.status_id)
        )
    )
    filter_list = (
        (db.HttpCodeChange.ip, ip),
        (db.HttpCodeChange.url, url),
    )
    for field, expected in filter_list:
        if expected is None:
            continue
        if isinstance(expected, list):
            # `<<` is the peewee IN operator
            query = query.where(field << expected)
        else:
            query = query.where(field == expected)
    return query
def logHttpStatus(db, ip, url, code, status_id):
    """Log an HTTP status code if it differs from the latest one stored.

    A new ``HttpCodeChange`` row bound to ``status_id`` is created only
    when no previous row exists for (ip, url) or when its status code
    differs from ``code``.

    :return: the ``status_id`` of the row describing the current code
        (either the existing one or the newly created one).
    """
    with db._db.atomic():
        latest_query = reportHttp(db, ip=ip, url=url)
        try:
            entry = latest_query.get()
        except db.HttpCodeChange.DoesNotExist:
            entry = None

        unchanged = (entry is not None) and (entry.status_code == code)
        if not unchanged:
            entry = db.HttpCodeChange.create(
                status=status_id, ip=ip, url=url, status_code=code
            )
    return entry.status_id
def checkHttpStatus(db, status_id, url, ip, bot_version, timeout=TIMEOUT):
    """Fetch ``url`` from the server at ``ip`` and log the status code.

    - https: the URL is kept as is and a dedicated session forces the
      connection to ``ip`` (which keeps SNI working).
    - http: the host of the URL is replaced by ``ip``; an explicit port
      is preserved.

    In both cases the ``Host`` header carries the original hostname.

    :raises NotImplementedError: for any other URL scheme.
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname
    request_kw = {"timeout": timeout}
    session = None

    if parsed_url.scheme == "https":
        # Provide SNI support
        base_url = urlunsplit(
            (parsed_url.scheme, parsed_url.netloc, "", "", "")
        )
        session = requests.Session()
        session.mount(base_url, ForcedIPHTTPSAdapter(dest_ip=ip))
        request_kw["session"] = session
        ip_url = url
    elif parsed_url.scheme == "http":
        # Force IP location, without losing an explicit port
        ip_netloc = ip
        if parsed_url.port is not None:
            ip_netloc = "%s:%d" % (ip, parsed_url.port)
        ip_url = parsed_url._replace(netloc=ip_netloc).geturl()
    else:
        raise NotImplementedError("Unhandled url: %s" % url)

    try:
        response = request(
            ip_url,
            headers={"Host": hostname},
            version=bot_version,
            **request_kw
        )
    finally:
        # Release the connections of the dedicated https session
        if session is not None:
            session.close()

    logHttpStatus(db, ip, url, response.status_code, status_id)
# Copyright (C) 2019 Nexedi SA and Contributors.
# Romain Courteaud <romain@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by