Commit c9867831 authored by Jérome Perrin's avatar Jérome Perrin

grafana split

parent a9cebcfd
......@@ -15,7 +15,16 @@
[instance-profile]
filename = instance.cfg.in
md5sum = e4d5ac3e6ad239d3bf48c2b3172919b5
md5sum = 32c772c593d2c3c38c26186b91b78cf8
[instance-default]
filename = instance-default.cfg.in
md5sum = 5a9650b8654b2aeaeab076b08b248b70
[instance-agent]
filename = instance-agent.cfg.in
md5sum = edb4eeb900ad13a3b3a6e174c1ea533b
[influxdb-config-file]
filename = influxdb-config-file.cfg.in
......@@ -23,9 +32,10 @@ md5sum = a28972ced3e0f4aa776e43a9c44717c0
[grafana-config-file]
filename = grafana-config-file.cfg.in
md5sum = 83a8445858eab21a12f1769c23424bea
md5sum = 2b75d6b1984d9d154303ec773aa88474
[grafana-provisioning-dashboards-config-file]
filename = grafana-provisioning-dashboards-config-file.cfg.in
md5sum = 5616679a9c5c2757540175ead3f5500a
......@@ -334,20 +334,21 @@ allow_sign_up = true
#################################### SMTP / Emailing #####################
[smtp]
{% set email = slapparameter_dict.get('email', {}) %}
#enabled = false
enabled = {{ slapparameter_dict.get('smtp-server') and 'true' or 'false' }}
enabled = {{ email.get('smtp-server') and 'true' or 'false' }}
#host = localhost:25
host = {{ slapparameter_dict.get('smtp-server', '') }}
host = {{ email.get('smtp-server', '') }}
#user =
user = {{ slapparameter_dict.get('smtp-username', '') }}
user = {{ email.get('smtp-username', '') }}
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
#password =
password = {{ slapparameter_dict.get('smtp-password', '') and '"""%s"""' % slapparameter_dict['smtp-password'] or ""}}
password = {{ email.get('smtp-password', '') and '"""%s"""' % email['smtp-password'] or ""}}
cert_file =
key_file =
skip_verify = {{ slapparameter_dict.get('smtp-verify-ssl') and 'true' or 'false' }}
from_address = {{ slapparameter_dict.get('email-from-address', '') }}
from_name = {{ slapparameter_dict.get('email-from-name', 'Grafana') }}
# Certificate verification is on unless `smtp-verify-ssl` is explicitly set to
# false: an unset parameter must behave like its documented default (true).
skip_verify = {{ email.get('smtp-verify-ssl', True) and 'false' or 'true' }}
from_address = {{ email.get('email-from-address', '') }}
from_name = {{ email.get('email-from-name', 'Grafana') }}
ehlo_identity =
[emails]
......
{
"$schema": "http://json-schema.org/draft-04/schema",
"$schema": "https://json-schema.org/draft/2019-09/schema",
"description": "Parameters to instantiate an agent collecting logs and metrics",
"type": "object",
"additionalProperties": false,
"unevaluatedProperties": false,
"$defs": {
"type": {
"description": "Type of the application. With `SlapOS` type, some metrics are collected from supervisor and from some known partition types (for example: ERP5's mariadb or ERP5's zopes). With `system` type, only log files are ingested.",
......@@ -36,7 +37,7 @@
"description": "Static tags for this partition",
"examples": [
{
"region": "eu",
"service-level": "production",
"data-center": "abc123"
}
]
......@@ -76,7 +77,11 @@
},
"instance-root": {
"description": "Directory containing SlapOS partitions.",
"type": "string"
"type": "string",
"examples": [
"/srv/slapgrid/",
"/srv/slapgrid/slappart30/srv/runner/instance/"
]
},
"partitions": {
"description": "SlapOS partitions to monitor",
......@@ -87,7 +92,7 @@
"name",
"reference"
],
"additionalProperties": false,
"unevaluatedProperties": false,
"properties": {
"name": {
"type": "string",
......@@ -120,13 +125,39 @@
"default": "default"
},
"log-file-patterns": {
"$refs": "#/$defs/log-file-patterns",
"$ref": "#/$defs/log-file-patterns",
"description": "Glob pattern for log files to watch. This mostly makes sense for `default` partition type"
},
"static-tags": {
"$refs": "#/$defs/static-tags"
"$ref": "#/$defs/static-tags"
}
},
"allOf": [
{
"if": {
"properties": {
"type": {
"enum": [
"mariadb",
"erp5/mariadb"
]
}
}
},
"then": {
"properties": {
"dbname": {
"type": "string",
"description": "Database name"
},
"username": {
"type": "string",
"description": "Username to connect to database"
}
}
}
}
],
"examples": [
{
"name": "zope-backoffice",
......@@ -187,10 +218,10 @@
]
},
"log-file-patterns": {
"$refs": "#/$defs/log-file-patterns"
"$ref": "#/$defs/log-file-patterns"
},
"static-tags": {
"$refs": "#/$defs/static-tags"
"$ref": "#/$defs/static-tags"
}
},
"examples": [
......
......@@ -6,6 +6,15 @@
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"type": "string"
},
"promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
}
},
"type": "object"
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
import urllib.parse
influxdb = options['slapparameter-dict']['influxdb']
options['url'] = influxdb['url']
options['database'] = influxdb['database']
options['auth-username'] = influxdb['username']
options['auth-password'] = influxdb['password']
parsed_url = urllib.parse.urlparse(options['url'])
options['hostname'] = parsed_url.hostname
# urlparse() reports `port` as None when the URL carries no explicit port.
# Fall back to the scheme's default port so that consumers of this option
# (the port-listening promise) receive a real port number, not the string "None".
options['port'] = str(parsed_url.port or (443 if parsed_url.scheme == 'https' else 80))
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb-server:hostname}
port = ${influxdb-server:port}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && ${:nice} {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
hash-files = ${telegraf-config-file:output}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
[telegraf-config-file]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
input-socket = ${directory:var}/tg.sock
init =
import zc.buildout
import pkg_resources
buildout_options = self.buildout["buildout"]
zc.buildout.easy_install.install(
["toml"],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options["develop-eggs-directory"],
buildout_options["eggs-directory"]])
import collections
import pathlib
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb-server']
self._config_files = {} # files to create during install step
access_path_dict = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
for application in slapparameter_dict.get("applications", []):
partition_mapping = {}
partition_directory = ''
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
if "reference" in partition:
partition_mapping[partition["reference"]] = partition["name"]
if application.get("instance-root"):
partition_directory = pathlib.Path(application["instance-root"]) / partition['reference']
if partition["type"] in ("erp5/mariadb", "mariadb"):
partition.setdefault("username", "root")
partition.setdefault("dbname", "erp5")
mariadb_socket = f"{partition_directory}/var/run/mariadb.sock"
dsn = f"{partition['username']}@unix({mariadb_socket})/{partition['dbname']}"
access_path_dict[mariadb_socket] = 'rw'
inputs["mysql"].append(
{
"servers": [dsn],
"gather_innodb_metrics": True,
"gather_slave_status": True,
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
inputs["sql"].append(
{
"name_override": "mariadb-activities",
"driver": "mysql",
"dsn": dsn,
"query": [
{
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
# Count failed activities per queue. BETWEEN takes its lower bound first:
# `between -2 and -10` can never match, which made this count always 0.
"query": """
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -10 and -2
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -10 and -2
""",
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/balancer":
# XXX do we really want this one ?
access_log = f"{partition_directory}/var/log/apache-access.log"
access_path_dict[access_log] = 'r'
# Tail the balancer access log, parse each line with grok and emit it under
# the "haproxy_logs" measurement name, tagged with the partition identity.
inputs["tail"].append(
{
"data_format": "grok",
"files": [access_log],
"grok_custom_pattern_files": [],
"grok_custom_patterns": "",
"grok_patterns": [
'%{IPORHOST:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \\[%{HTTPDATE:timestamp}\\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-) %{QS:referrer} %{QS:agent} %{NUMBER:response_time:int}'
],
"grok_timezone": "Local",
"name_override": "haproxy_logs",
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
inputs["haproxy"].append(
{
"servers": [f"{partition_directory}/var/run/ha.sock"],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
})
urls = application.get("urls", [])
if urls:
inputs["http_response"].append({
"interval": "5m",
"urls": urls,
"tags": {"app": application["name"]},
})
for url in urls:
x509_url = url
parsed_url = urllib.parse.urlparse(url)
if parsed_url.scheme == 'https':
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
# Check the certificate served at this URL. A dict literal keeps only the
# last occurrence of a repeated key, so the `url` and `app` tags must be
# declared in one mapping for both of them to reach telegraf.
inputs["x509_cert"].append({
"sources": [x509_url],
"interval": "5h",
"tags": {"url": url, "app": application["name"]},
})
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = str(
pathlib.Path(self.options['location'])
/ f"telegraf-input-slapos-{application['name']}.cfg"
)
self._config_files[telegraf_slapos_input_config_file] = toml.dumps({
"inputs": {
"slapos": [{
"instance_root": application["instance-root"]}]}})
access_path_dict[f"{application['instance-root']}/sv.sock"] = 'rw'
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": "slapos_services",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": ["slapos_services"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": ["slapos_services"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": ["slapos_services"],
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": ["slapos_services"],
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": ["slapos_services"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": ["slapos_services"],
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
# add a socket input so that we can have a promise verifying that telegraf is listening
inputs['socket_listener'].append({"service_address": f"unix://{self.options['input-socket']}"})
options['access-path-dict'] = access_path_dict
self._config_files[options['output']] = toml.dumps(config)
install =
import os
os.mkdir(self.options['location'])
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
[loki-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
loki = options['slapparameter-dict']['loki']
options['url'] = loki['url']
options['caucase-url'] = loki['caucase-url']
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
distinguished_name = dn
[ dn ]
CN = ${:cn}
L = ${slap-connection:computer-id}
O = ${slap-connection:partition-id}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
cn = loki ${slap-connection:partition-id}@${slap-connection:computer-id}
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-server:caucase-url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[promtail]
recipe = slapos.cookbook:wrapper
command-line = ${:nice} {{ promtail_bin }} -config.file=${promtail-config-file:location}
wrapper-path = ${directory:service}/promtail
hash-files =
${promtail-config-file:location}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.yaml
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
init =
import pathlib
import json
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
loki_certificate = self.buildout['loki-promtail-client-certificate']
self._config_files = {} # files to create during install step
access_path_dict = {}
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
"http_listen_port": int(self.buildout['promtail']['http-port']),
"grpc_listen_address": self.buildout['promtail']['ip'],
"grpc_listen_port": int(self.buildout['promtail']['grpc-port']),
"graceful_shutdown_timeout": 5,
"external_url": self.buildout['promtail']['url'],
},
"positions": {
"filename": "{}/positions.yaml".format(self.buildout['promtail']['dir']),
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": loki_certificate['ca-file'],
"cert_file": loki_certificate['cert-file'],
"key_file": loki_certificate['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
"scrape_configs": []
}
# Build the stream selector used by the `match` pipeline stages of a job.
# The selector always starts with the `app` label, followed by the partition's
# `static-tags` (if any) and the `job` label; `job` overrides a static tag of
# the same name because it is passed as a keyword to dict().
def get_job_selector(partition, job_name, application_name):
# make a selector in LogQL, like '{job="job_name",key="value"}'
selector_parts = [f'app="{application_name}"']
for k, v in dict(partition.get('static-tags', {}), job=job_name).items():
selector_parts.append(f'{k}="{v}"')
return "{%s}" % ",".join(selector_parts)
# Build the `static_configs` list of a scrape job: one entry per log path.
#
# `path_list` must be a list of path templates; each one is expanded with
# str.format(directory=...), where `directory` is `<instance-root>/<reference>`
# when the partition has a reference and the application an `instance-root`,
# and an empty string otherwise. Raises ValueError when `path_list` is not a list.
#
# Side effect: every expanded path is recorded in `access_path_dict` with
# read ('r') access.
def get_static_configs(partition, job_name, path_list, application):
if not isinstance(path_list, list):
raise ValueError(f'{path_list!r} is not a list')
directory = ''
if partition.get('reference') and 'instance-root' in application:
instance_root = pathlib.Path(application['instance-root'])
directory = instance_root / partition['reference']
path_list = [path.format(directory=directory) for path in path_list]
for path in path_list:
access_path_dict[path] = 'r'
# `partition_reference` is only set as a label for partitions having a reference
partition_kw = {}
if partition.get('reference'):
partition_kw['partition_reference'] = partition['reference']
return [
{
"targets": [
"localhost"
],
# keyword labels take precedence over same-named entries of `static-tags`
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
app=application['name'],
name=partition['name'],
computer_id=slap_connection['computer-id'],
__path__=path,
**partition_kw
)
} for path in path_list
]
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
if partition['type'] in ('erp5/zope-activity', 'erp5/zope-front'):
# job name include the app name because they need to be unique
job_name = f"{application['name']}-{partition['name']}-event-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# TODO this does not seem to work well
"firstline": "^------",
"max_wait_time": "5s"
}
},
{
"regex": {
# TODO don't include the ----
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zope-*-event.log"],
application,
)})
if partition['type'] == 'erp5/zope-front':
job_name = f"{application['name']}-{partition['name']}-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
# drop requests for haproxy health check
"pipeline_stages": [
{
"drop": {
"expression": '.* "GET / HTTP/1.0" 200 .*'
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zope-*-Z2.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-long-request-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^(?P<timestamp>.*) .*"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/longrequest_logger_zope-*.log"],
application,
)})
if partition['type'] in ('erp5/mariadb', 'mariadb'):
job_name = f"{application['name']}-{partition['name']}-mariadb-slow-queries"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happens
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": ".*SET timestamp=(?P<timestamp>\\d+);.*"
}
},
{
"timestamp": {
"format": "Unix",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/mariadb_slowquery.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-mariadb-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"timestamp": {
"format": "2021-06-05 3:55:31",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/mariadb_error.log"],
application,
)})
if partition['type'] == 'erp5/zeo':
job_name = f"{application['name']}-{partition['name']}-zeo-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None,
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zeo-*.log"],
application,
)})
if partition['type'] == 'erp5/balancer':
job_name = f"{application['name']}-{partition['name']}-balancer-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/apache-access.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-balancer-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/apache-error.log"],
application,
)})
if partition.get('log-file-patterns'):
job_name = f"{application['name']}-{partition['name']}"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
partition['log-file-patterns'],
application,
)})
self._config_files[options['location']] = json.dumps(cfg, indent=2)
options['access-path-dict'] = access_path_dict
install =
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
{% endraw %}
[promtail-listen-promise]
<= check-port-listening-promise
hostname = ${promtail:ip}
port = ${promtail:http-port}
[telegraf-listen-promise]
recipe = slapos.cookbook:wrapper
command-line =
test -S ${telegraf-config-file:input-socket}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
# Generate a shell script made of `setfacl --modify` commands granting the
# user running this partition access to every path recorded by the promtail
# and telegraf configuration sections, and to the parent directories of
# these paths.
[facl-script]
recipe = slapos.recipe.build
promtail-access-path-dict = ${promtail-config-file:access-path-dict}
telegraf-access-path-dict = ${telegraf-config-file:access-path-dict}
install =
import itertools
import os
import pathlib
import pwd
import shlex
# name of the user executing buildout; ACL entries are created for this user
user = pwd.getpwuid(os.getuid()).pw_name
script_code = ''
def quote_path(p):
# quote, but preserve *
# `*` is swapped with a placeholder around shlex.quote() so that it stays
# unescaped in the returned string; the assert guards against paths that
# already contain the placeholder.
p = str(p)
assert '__STAR__' not in p
p = p.replace('*', '__STAR__')
p = shlex.quote(p)
p = p.replace('__STAR__', '*')
return p
# make sure we can access the parents folders
# maps each ancestor directory (as str) to the 'x' permission
parent_access = {}
def check_parent_access(path):
# walk up recursively and stop once a path is its own parent
parent = path.parent
if parent != path:
parent_access[str(parent)] = 'x'
check_parent_access(parent)
# first pass: collect the ancestors of every requested path
for path_spec, access in itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items()):
path = pathlib.Path(path_spec)
check_parent_access(path)
# second pass: emit the commands for requested paths and ancestors, in sorted order
for path_spec, access in sorted(itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items(),
parent_access.items())):
path = pathlib.Path(path_spec)
# for a glob pattern, the containing directory gets an `rx` entry
# (presumably so that the pattern can be expanded - TODO confirm)
if '*' in path_spec:
script_code += f'setfacl --modify=u:{user}:rx {quote_path(path.parent)}\n'
script_code += f'setfacl --modify=u:{user}:{access} {quote_path(path)}\n'
# NOTE(review): `location` is not defined in this script; presumably it is
# provided by slapos.recipe.build as the part's location - confirm.
pathlib.Path(location).write_text(script_code)
[promises]
recipe =
instance-promises =
${promtail-listen-promise:path}
${telegraf-listen-promise:wrapper-path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
facl-script = ${facl-script:location}
promtail-url = ${promtail:url}
......@@ -3,6 +3,11 @@
"description": "Parameters to instantiate Grafana",
"type": "object",
"additionalProperties": false,
"properties": {
"email": {
"type": "object",
"description": "Email configuration",
"additionalProperties": false,
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
......@@ -17,23 +22,49 @@
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "boolean"
"description": "Verify certificate of SMTP server",
"type": "boolean",
"default": true
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"description": "Email address used in `From:` header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"description": "Name used in `From:` header of emails",
"default": "Grafana",
"type": "string"
}
}
},
"frontend": {
"type": "object",
"additionalProperties": false,
"properties": {
"custom-domain": {
"description": "Custom domain to use when requesting a rapid-cdn frontend",
"type": "string",
"format": "hostname"
}
}
},
"caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates",
"caucase": {
"type": "object",
"description": "Caucase configuration. To connect external agents, it's required to approve their client certificates, either using an external caucase referenced as `external-caucase-url` or registering a user with `user-auto-approve-count`",
"additionalProperties": false,
"properties": {
"external-caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates, to use instead of embedding caucase",
"type": "string",
"format": "uri"
},
"user-auto-approve-count": {
"description": "Number of users to automatically approve in the embedded caucase",
"type": "integer",
"default": 0
}
}
},
"influxdb": {
"description": "Fine tuning influxdb parameters",
"type": "object",
......
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by Grafana instantiation",
"additionalProperties": false,
"properties": {
"url": {
"description": "Shared frontend for this Grafana instance",
......@@ -47,6 +46,15 @@
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
},
"agent-promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"agent-facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
}
},
"type": "object"
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
# Directory layout of the partition. Every path is derived from the buildout
# directory (home).
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
# influxdb and grafana
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
grafana-dashboards-dir = ${:grafana-dir}/dashboards
# loki and the caucase services managing its certificates
loki-dir = ${:srv}/loki
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
# macros
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
# Macro: a section extending this one gets a key / certificate pair named
# after itself in the etc directory. The pair is only generated when the key
# file does not exist yet; the same command is used for install and update.
recipe = plone.recipe.command
command =
  if [ ! -e ${:key-file} ]
  then
    {{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
      -subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
      -newkey rsa -keyout ${:key-file} \
      -out ${:cert-file}
  fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
# Macro: renders the jinja2 template named after the extending section
# (looked up in the parts directory) into a .cfg file in the etc directory.
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
# Macro: extending sections provide hostname and port; the promise is
# written in the promise directory under the name of the extending section.
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
# Macro: extending sections provide url (and optionally certificate files);
# the promise is written in the promise directory under the name of the
# extending section.
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
# Connection settings reused by other sections, followed by the service
# wrapper starting influxd with the rendered configuration file.
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
  {{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
# InfluxDB configuration rendered with the influxdb section as its context.
<= config-file
context =
  section influxdb influxdb
[influxdb-password]
# Generated credentials, referenced as auth-username / auth-password by the
# influxdb section.
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
# Self-signed key / certificate pair used as ssl-key-file / ssl-cert-file by
# the influxdb section.
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
# Checks the InfluxDB HTTP port on the IPv6 address.
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
# Runs CREATE USER for the generated credentials through the unix socket.
# NOTE(review): the password is passed on the command line.
recipe = slapos.cookbook:wrapper
command-line =
  {{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
# Lists the databases over https (certificate verification disabled with
# -unsafeSsl) and greps for the configured database name.
recipe = slapos.cookbook:wrapper
command-line =
  bash -c "{{ influx_bin }} \
    -username ${influxdb:auth-username} \
    -password ${influxdb:auth-password} \
    -host [${influxdb:host}] \
    -port ${influxdb:http-port} \
    -unsafeSsl \
    -ssl \
    -execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
# Creates the default retention policy through the unix socket. The duration
# in days comes from the influxdb / default-retention-policy-days instance
# parameter and falls back to 720.
# NOTE(review): the section name spells "defaul"; it is referenced under this
# exact name from the promises section, so both must change together.
recipe = slapos.cookbook:wrapper
command-line =
  {{ influx_bin }}
  -username ${influxdb:auth-username}
  -password ${influxdb:auth-password}
  -socket ${influxdb:unix-socket}
  -execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION {{ slapparameter_dict.get('influxdb', {}).get('default-retention-policy-days', 720) }}d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
# Settings reused by the Grafana configuration template and other sections,
# followed by the service wrapper starting the Grafana server.
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
provisioning-dashboards-dir = ${directory:grafana-provisioning-dashboards-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
  {{ grafana_bin }}
  server
  -config ${grafana-config-file:output}
  -homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
# NOTE(review): presumably these make the service restart when the listed
# files change - confirm against the wrapper recipe.
hash-files =
  ${grafana-config-file:output}
hash-existing-files =
  ${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
# Self-signed key / certificate pair used as ssl-key-file / ssl-cert-file by
# the grafana section.
<= generate-insecure-self-signed-certificate
[grafana-password]
# Generated admin credentials, published as grafana-username and
# grafana-password.
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
# Generated value used as secret-key by the grafana section.
recipe = slapos.cookbook:generate.password
[grafana-config-file]
# Grafana configuration, rendered with the grafana and apache-frontend
# sections and the instance parameters as context. The depends option
# references both provisioning files.
<= config-file
context =
  section grafana grafana
  section apache_frontend apache-frontend
  key slapparameter_dict slap-configuration:configuration
depends =
  ${grafana-provisioning-datasources-config-file:location}
  ${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
# Generates the Grafana datasources provisioning file declaring the influxdb
# ("telegraf") and loki datasources. The loki datasource embeds the CA
# certificate and the client certificate / key as text.
recipe = slapos.recipe.build
init =
  # pre-create location, so that we can use hash-existing-files
  import pathlib
  datasource_file = pathlib.Path(location)
  if not datasource_file.parent.exists():
    datasource_file.parent.mkdir(parents=True)
  if not datasource_file.exists():
    datasource_file.touch()
  # make sure this part is reinstalled when certificate is updated
  import os
  # stays -1 as long as one of the two files does not exist
  cert_mtime = -1
  try:
    cert_mtime = (
      os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
      + os.stat(options['loki-server-certificate-ca-file']).st_mtime
    )
  except FileNotFoundError:
    pass
  options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
  import json
  import os
  # returns the content of the file, or None when the file does not exist;
  # None is serialised as null, which the datasources promise checks for
  def safe_read_file(path):
    if os.path.exists(path):
      with open(path) as f:
        return f.read()
  influxdb_data_source = {
    "name": "telegraf",
    "type": "influxdb",
    "access": "proxy",
    "url": options['influxdb-url'],
    "user": options['influxdb-auth-username'],
    "database": options['influxdb-database'],
    "isDefault": True,
    "jsonData": {
      "tlsSkipVerify": True
    },
    "secureJsonData": {
      "password": options['influxdb-auth-password'],
    },
    # the version follows the certificate modification times computed in init
    "version": int(options['loki-grafana-client-certificate-cert-mtime']),
    "editable": False
  }
  loki_data_source = {
    "name": "loki",
    "type": "loki",
    "access": "proxy",
    "url": options['loki-server-url'],
    "jsonData": {
      "tlsAuth": True,
      "tlsAuthWithCACert": True,
      "maxLines": 50000,
    },
    "secureJsonData": {
      # XXX maybe we can use file directly ?
      # see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
      "tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
      "tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
      "tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
    },
    "version": int(options['loki-grafana-client-certificate-cert-mtime']),
    "editable": False,
  }
  # the promise addresses the loki datasource by index 1, keep this order
  config = {
    "apiVersion": 1,
    "datasources": [
      influxdb_data_source,
      loki_data_source,
    ],
  }
  # the content is written as JSON although the file name ends with .yaml
  with open(options['location'], 'w') as f:
    json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
# Dashboard provider definition for Grafana, rendered into the dashboards
# provisioning directory with the dashboards directory as context.
# The config-file macro uses the url / output option names, so the
# destination has to be overridden through output (not rendered) to take
# effect.
<= config-file
output = ${grafana:provisioning-dashboards-dir}/dashboard.yaml
context =
  key dashboards_dir directory:grafana-dashboards-dir
[grafana-listen-promise]
# Checks the Grafana port on the IPv6 address.
<= check-port-listening-promise
hostname = ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
# Fails (jq -e) while the second datasource of the provisioning file has no
# client certificate or no CA certificate.
recipe = slapos.cookbook:wrapper
command-line =
  {{ jq_bin }} -e
  "if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
  ${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[loki-server]
# Settings of the Loki server, read by the section generating its
# configuration file and by the sections needing its url or certificates.
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
[loki-service]
# Service wrapper starting Loki with the generated configuration file.
# ready-url is the url checked by the loki listen promise.
recipe = slapos.cookbook:wrapper
command-line =
  {{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
  ${loki-server-config-file:location}
hash-existing-files =
  ${loki-server-certificate:cert-file}
[loki-server-config-file]
# Generates the Loki server configuration from the loki-server section and
# the instance parameters.
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
  import json
  loki_server = self.buildout['loki-server']
  slapparameter_dict = self.buildout['slap-configuration']['configuration']
  config = {
    "auth_enabled": False,
    "server": {
      # http listens on the IPv6 address and requires a client certificate
      "http_listen_address": loki_server['ipv6'],
      "http_listen_port": int(loki_server['http-port']),
      "http_tls_config": {
        "client_ca_file": loki_server['ca-file'],
        "cert_file": loki_server['cert-file'],
        "key_file": loki_server['key-file'],
        "client_auth_type": "RequireAndVerifyClientCert",
      },
      # grpc listens on the IPv4 address; both limits are 100 MiB
      "grpc_listen_address": loki_server['ipv4'],
      "grpc_server_max_recv_msg_size": 104857600,
      "grpc_server_max_send_msg_size": 104857600
    },
    "common": {
      "instance_addr": loki_server['ipv4'],
      "replication_factor": 1,
      "ring": {
        "instance_addr": loki_server['ipv4'],
        "kvstore": {
          "store": "inmemory"
        }
      },
      "path_prefix": loki_server['path-prefix'],
    },
    "schema_config": {
      "configs": [
        {
          "from": "2020-05-15",
          "store": "tsdb",
          "object_store": "filesystem",
          "schema": "v13",
          "index": {
            "prefix": "index_",
            "period": "24h"
          }
        }
      ]
    },
    "storage_config": {
      "filesystem": {
        "directory": loki_server['storage-filesystem-directory'],
      }
    },
    "limits_config": {
      "ingestion_rate_mb": 1024,
      "ingestion_burst_size_mb": 1024,
      # NOTE(review): one more than the maxLines of the grafana loki
      # datasource, presumably on purpose - confirm
      "max_entries_limit_per_query": 50001,
      "reject_old_samples": False,
      # loki / retention-period-days instance parameter, 60 days by default
      "retention_period": '{}d'.format(
        slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
    },
    "frontend_worker": {
      "grpc_client_config": {
        # TODO check needed
        # https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
        # 256 MiB
        "max_send_msg_size": 268435456
      }
    },
    "compactor": {
      "working_directory": loki_server['compactor-working-directory'],
      "delete_request_store": "filesystem",
      "retention_enabled": True,
      "retention_delete_delay": "2h",
    }
  }
  # the content is written as JSON although the file name ends with .yaml
  with open(options['location'], 'w') as f:
    json.dump(config, f, indent=2)
[loki-server-certificate-init-certificate]
# Holds the init code reused by the loki-server-certificate section.
recipe = slapos.recipe.build
init =
  # pre-create a file at the path of the certificate,
  # so that we can use hash-existing-files options
  import pathlib
  cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
  # create missing intermediate directories too, and tolerate a directory
  # that already exists, like the datasources provisioning section does
  cert_file.parent.mkdir(parents=True, exist_ok=True)
  # only create the file when missing, an existing certificate is not touched
  if not cert_file.exists():
    cert_file.touch()
[loki-server-certificate]
# Paths of the Loki server key, certificate, CA certificate and CRL. The
# caucase updater macro below is given these paths, the loki caucase url and
# the CSR prepared by loki-server-certificate-prepare-csr.
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
    prefix='loki-server-certificate',
    buildout_bin_directory=buildout_bin_directory,
    updater_path='${directory:service}/loki-server-certificate-updater',
    url='${loki-caucased:url}',
    data_dir='${directory:caucase-updater-loki-server}',
    crt_path='${loki-server-certificate:cert-file}',
    ca_path='${loki-server-certificate:ca-file}',
    crl_path='${loki-server-certificate:crl-file}',
    key_path='${loki-server-certificate:key-file}',
    template_csr='${loki-server-certificate-prepare-csr:csr}',
    openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
# openssl request configuration for the Loki server certificate: common name
# loki-server, with the IPv4 and IPv6 addresses of the server as subject
# alternative names.
recipe = slapos.recipe.template
inline =
  [req]
  prompt = no
  req_extensions = req_ext
  distinguished_name = dn
  [ dn ]
  CN = loki-server
  [ req_ext ]
  subjectAltName = @alt_names
  [ alt_names ]
  IP.1 = ${loki-server:ipv4}
  IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
# Generates the certificate signing request once: nothing is done when the
# csr file already exists. The key generated by openssl is discarded
# (-keyout /dev/null).
recipe = plone.recipe.command
command =
  if [ ! -f '${:csr}' ] ; then
    {{ openssl_bin }} req \
      -newkey rsa \
      -batch \
      -new \
      -sha256 \
      -nodes \
      -keyout /dev/null \
      -config '${loki-server-certificate-csr-config:output}' \
      -out '${:csr}'
  fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-server-listen-promise]
# Checks the ready url of Loki, authenticating with the client certificate
# dedicated to the promise.
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-client-certificate]
# Macro: file paths of a Loki client certificate, named after the extending
# section.
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
# Macro: openssl request configuration for a client certificate; the common
# name is the name of the extending section.
recipe = slapos.recipe.template
inline =
  [req]
  prompt = no
  distinguished_name = dn
  [ dn ]
  CN = ${:_buildout_section_name_}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# Macro: generates a client certificate signing request once (nothing is
# done when the csr file already exists). Extending sections set config to
# the path of their openssl request configuration.
# variable
config =
recipe = plone.recipe.command
command =
  if [ ! -f '${:csr}' ] ; then
    {{ openssl_bin }} req \
      -newkey rsa \
      -batch \
      -new \
      -sha256 \
      -nodes \
      -keyout /dev/null \
      -config '${:config}' \
      -out '${:csr}'
  fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
# Client certificate used by the loki listen promise, built from the three
# client certificate macros and the caucase updater macro.
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
    prefix='loki-promise-client-certificate',
    buildout_bin_directory=buildout_bin_directory,
    updater_path='${directory:service}/loki-promise-client-certificate-updater',
    url='${loki-caucased:url}',
    data_dir='${directory:caucase-updater-loki-promise-client}',
    crt_path='${loki-promise-client-certificate:cert-file}',
    ca_path='${loki-promise-client-certificate:ca-file}',
    crl_path='${loki-promise-client-certificate:crl-file}',
    key_path='${loki-promise-client-certificate:key-file}',
    template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
    openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
# Client certificate embedded in the Grafana loki datasource, built from the
# three client certificate macros and the caucase updater macro.
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
    prefix='loki-grafana-client-certificate',
    buildout_bin_directory=buildout_bin_directory,
    updater_path='${directory:service}/loki-grafana-client-certificate-updater',
    url='${loki-caucased:url}',
    data_dir='${directory:caucase-updater-loki-grafana-client}',
    crt_path='${loki-grafana-client-certificate:cert-file}',
    ca_path='${loki-grafana-client-certificate:ca-file}',
    crl_path='${loki-grafana-client-certificate:crl-file}',
    key_path='${loki-grafana-client-certificate:key-file}',
    template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
    openssl=openssl_bin,
)}}
# When the caucase / external-caucase-url instance parameter is set, only its
# url is exposed; otherwise an embedded caucase service is set up.
{% if slapparameter_dict.get('caucase', {}).get('external-caucase-url') %}
[loki-caucased]
url = {{ slapparameter_dict.get('caucase', {}).get('external-caucase-url') }}
{% else %}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count is 4 for the default:
#  - server: loki
#  - clients: loki promise, grafana, promtail
{{
caucase.caucased(
    prefix='loki-caucased',
    buildout_bin_directory=buildout_bin_directory,
    caucased_path='${directory:service}/loki-caucased',
    backup_dir='${directory:backup-caucased-loki}',
    data_dir='${directory:srv-caucased-loki}',
    netloc='${loki-caucased:netloc}',
    tmp='${directory:tmp}',
    service_auto_approve_count=4,
    user_auto_approve_count='${loki-caucased-user-auto-approve-count:user-auto-approve-count}',
    key_len=2048,
)}}
[loki-caucased-user-auto-approve-count]
# caucase / user-auto-approve-count instance parameter, 0 by default
user-auto-approve-count = {{ slapparameter_dict.get('caucase', {}).get('user-auto-approve-count', 0) }}
{% endif %}
[apache-frontend]
# Requests a shared frontend for the Grafana url. The custom domain is only
# passed when the frontend / custom-domain instance parameter is set.
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
{% if slapparameter_dict.get('frontend', {}).get('custom-domain') %}
config-custom_domain = {{ slapparameter_dict['frontend']['custom-domain'] }}
{% endif %}
return = domain secure_access
[apache-frontend-available-promise]
# Checks the secure_access url returned by the frontend request.
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
[request-agent-config]
# Computes the parameters passed to the agent instance: the applications of
# the agent / applications instance parameter plus a default application
# ingesting the logs of this partition, and the loki and influxdb connection
# settings.
recipe = slapos.recipe.build
init =
  slap_connection = self.buildout["slap-connection"]
  configuration = self.buildout['slap-configuration']['configuration']
  # work on a copy, so that the list held by the instance parameters is not
  # modified by the append below
  applications = list(configuration.get('agent', {}).get('applications', []))
  applications.append(
    # Add a default config ingesting grafana's and influxdb's logs
    {
      "name": "Grafana",
      "type": "system",
      "partitions": [
        {
          # each partition entry collects the log files of the service it is
          # named after
          "name": "grafana",
          "static-tags": {
            "partition_reference": slap_connection['partition-id'],
          },
          "log-file-patterns": [
            f"{self.buildout['directory']['home']}/.*_grafana*.log",
          ]
        },
        {
          "name": "influxdb",
          "static-tags": {
            "partition_reference": slap_connection['partition-id'],
          },
          "log-file-patterns": [
            f"{self.buildout['directory']['home']}/.*_influxdb*.log",
          ]
        },
      ]
    }
  )
  options['applications'] = applications
  options['loki'] = {
    'url': self.buildout['loki-server']['url'],
    'caucase-url': self.buildout['loki-caucased']['url'],
  }
  options['influxdb'] = {
    "url": self.buildout['influxdb']['url'],
    "database": self.buildout['influxdb']['database'],
    "username": self.buildout['influxdb']['auth-username'],
    "password": self.buildout['influxdb']['auth-password'],
  }
[request-slapos-partition-base]
# Macro: requests another partition of the same software release, using the
# connection settings of this partition.
recipe = slapos.cookbook:request.serialised
software-url = ${slap-connection:software-release-url}
server-url = ${slap-connection:server-url}
key-file = ${slap-connection:key-file}
cert-file = ${slap-connection:cert-file}
computer-id = ${slap-connection:computer-id}
partition-id = ${slap-connection:partition-id}
[request-agent]
# Requests an instance of the agent software type with the parameters
# computed by request-agent-config, and asks for its facl-script and
# promtail-url connection parameters.
<= request-slapos-partition-base
software-type = agent
name = agent
return = facl-script promtail-url
config-applications = ${request-agent-config:applications}
config-loki = ${request-agent-config:loki}
config-influxdb = ${request-agent-config:influxdb}
[agent-promtail-url]
# Parses the promtail url returned by the agent request, so that the promise
# below can use its host and port.
recipe = slapos.cookbook:urlparse
url = ${request-agent:connection-promtail-url}
[agent-promtail-listen-promise]
# Checks the host and port of the promtail url of the agent.
<= check-port-listening-promise
hostname = ${agent-promtail-url:host}
port = ${agent-promtail-url:port}
[promises]
# References every promise of this instance. The recipe is left empty.
recipe =
instance-promises =
  ${influxdb-listen-promise:path}
  ${influxdb-password-promise:wrapper-path}
  ${influxdb-database-ready-promise:wrapper-path}
  ${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
  ${grafana-listen-promise:path}
  ${grafana-provisioning-datasources-config-file-promise:wrapper-path}
  ${loki-server-listen-promise:path}
  ${apache-frontend-available-promise:path}
  ${agent-promtail-listen-promise:path}
[publish-connection-parameter]
# Connection parameters published by this instance: influxdb, grafana and
# loki access, the frontend url, and the values returned by the agent
# request.
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
url = ${apache-frontend:connection-secure_access}
agent-facl-script = ${request-agent:connection-facl-script}
agent-promtail-url = ${request-agent:connection-promtail-url}
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
parts = switch-softwaretype
eggs-directory = {{ buildout['eggs-directory'] }}
develop-eggs-directory = {{ buildout['develop-eggs-directory'] }}
offline = true
[instance-parameter]
# Partition information read with the slap-connection credentials.
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[jinja2-template-base]
# Macro: renders an instance profile named after the extending section into
# the parts directory. The context gives the profile the buildout
# directories, the instance parameters and the paths of the binaries; the
# caucase jinja2 library is made importable under the name caucase.
recipe = slapos.recipe.template:jinja2
filename = ${:_buildout_section_name_}.cfg
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:filename}
extensions =
  jinja2.ext.do
extra-context =
context =
  raw buildout_bin_directory {{ buildout['bin-directory'] }}
  raw buildout_parts_directory {{ buildout['parts-directory'] }}
  raw buildout_eggs_directory {{ buildout['eggs-directory'] }}
  raw buildout_develop_eggs_directory {{ buildout['develop-eggs-directory'] }}
  key slapparameter_dict slap-configuration:configuration
  raw instance_default {{ instance_default }}
  raw instance_agent {{ instance_agent }}
  raw openssl_bin {{ openssl_bin }}
  raw telegraf_bin {{ telegraf_bin }}
  raw telegraf_input_slapos_bin {{ telegraf_input_slapos_bin }}
  raw influxd_bin {{ influxd_bin }}
  raw influx_bin {{ influx_bin }}
  raw grafana_bin {{ grafana_bin }}
  raw grafana_homepath {{ grafana_homepath }}
  raw loki_bin {{ loki_bin }}
  raw promtail_bin {{ promtail_bin }}
  raw curl_bin {{ curl_bin }}
  raw dash_bin {{ dash_bin }}
  raw jq_bin {{ jq_bin }}
import-list =
  file caucase context:caucase-jinja2-library
[context]
# Path of the caucase jinja2 library, referenced by the import-list of
# jinja2-template-base.
caucase-jinja2-library = {{ caucase_jinja2_library }}
[instance-default]
# Rendered profile of the default software type.
<= jinja2-template-base
url = {{ instance_default }}
[instance-agent]
# Rendered profile of the agent software type.
<= jinja2-template-base
url = {{ instance_agent }}
[switch-softwaretype]
# Maps each software type to a rendered profile: default and
# RootSoftwareInstance use the instance-default profile, agent uses the
# instance-agent profile.
recipe = slapos.cookbook:switch-softwaretype
default = instance-default:output
RootSoftwareInstance = ${:default}
agent = instance-agent:output
[slap-configuration]
# apache-frontend reads from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
grafana-dashboards-dir = ${:grafana-dir}/dashboards
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
loki-dir = ${:srv}/loki
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
then
{{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
-subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
-newkey rsa -keyout ${:key-file} \
-out ${:cert-file}
fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout['parts-directory'] }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
<= config-file
context =
section influxdb influxdb
[influxdb-password]
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
recipe = slapos.cookbook:wrapper
command-line =
bash -c "{{ influx_bin }} \
-username ${influxdb:auth-username} \
-password ${influxdb:auth-password} \
-host [${influxdb:host}] \
-port ${influxdb:http-port} \
-unsafeSsl \
-ssl \
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
# TODO: actually use parameter
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION 720d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
provisioning-dashboards-dir = ${directory:grafana-provisioning-dashboards-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
<= generate-insecure-self-signed-certificate
[grafana-password]
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
[grafana-config-file]
<= config-file
context =
section grafana grafana
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": "telegraf",
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 50000,
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
# Dashboard provider definition for Grafana, rendered into the dashboards
# provisioning directory with the dashboards directory as context.
# The config-file macro uses the url / output option names, so the
# destination has to be overridden through output (not rendered) to take
# effect.
<= config-file
output = ${grafana:provisioning-dashboards-dir}/dashboard.yaml
context =
  key dashboards_dir directory:grafana-dashboards-dir
[grafana-listen-promise]
# Checks the Grafana port on the IPv6 address.
<= check-port-listening-promise
hostname = ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && nice -19 chrt --idle 0 ionice -c3 {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
[telegraf-config-file]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
init =
import zc.buildout
import pkg_resources
buildout_options = self.buildout["buildout"]
zc.buildout.easy_install.install(
["toml"],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options["develop-eggs-directory"],
buildout_options["eggs-directory"],
],
)
import collections
import os.path
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb']
# files to create during install step
self._config_files = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
# v TODO remove agent
for application in slapparameter_dict.get("agent", {}).get("applications", []):
partition_mapping = {}
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
if "reference" in partition:
partition_mapping[partition["reference"]] = partition["name"]
partition_directory = os.path.join(application["instance-root"], partition['reference'])
if partition["type"] in ("erp5/mariadb", "mariadb"):
partition.setdefault("username", "root")
partition.setdefault("dbname", "erp5")
dsn = f"{partition['username']}@unix({partition_directory}/var/run/mariadb.sock)/{partition['dbname']}"
inputs["mysql"].append(
{
"name_override": f"{partition['name']}-mysql",
"servers": [dsn],
"gather_innodb_metrics": True,
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
  # ERP5 databases: in addition to the generic mysql input, run SQL queries
  # against the `message` and `message_queue` tables and report the results
  # as "<partition name>-activities", tagged by queue name.
  inputs["sql"].append(
    {
      "name_override": f"{partition['name']}-activities",
      "driver": "mysql",
      "dsn": dsn,
      "query": [
        {
          # number of rows in each queue table
          "query": """
            select 'message' as cmf_activity_queue, count(*) as message_count from message
            union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
          """,
          "field_columns_include": ["message_count"],
          "tag_columns_include": ["cmf_activity_queue"],
        },
        {
          # number of rows whose processing_node is within [-10, -2].
          # BETWEEN takes the lower bound first: written as
          # `between -2 and -10` the condition can never be true and the
          # reported count would always be 0.
          "query": """
            select 'message' as cmf_activity_queue, count(*) as failed_message_count
              from message where processing_node between -10 and -2
            union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
              from message_queue where processing_node between -10 and -2
          """,
          "field_columns_include": ["failed_message_count"],
          "tag_columns_include": ["cmf_activity_queue"],
        },
        {
          # age in seconds of the oldest row with processing_node -1 or 0
          # whose message does not contain "after_tag" (0 when there is none)
          "query": """
            select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
              as waiting_time, 'message' as cmf_activity_queue
              from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
            union all
            select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
              as waiting_time, 'message_queue' as cmf_activity_queue
              from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
          """,
          "field_columns_include": ["waiting_time"],
          "tag_columns_include": ["cmf_activity_queue"],
        },
      ],
      # user supplied static tags first, so that the fixed tags below win
      "tags": dict(
        partition.get("static-tags", {}),
        app=application["name"],
        name=partition["name"],
        partition_reference=partition["reference"],
      ),
    }
  )
if partition["type"] == "erp5/balancer":
inputs["tail"].append(
{
"data_format": "grok",
"files": [f"{partition_directory}/var/log/apache-access.log"],
"grok_custom_pattern_files": [],
"grok_custom_patterns": "",
"grok_patterns": [
'%{IPORHOST:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \\[%{HTTPDATE:timestamp}\\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-) %{QS:referrer} %{QS:agent} %{NUMBER:response_time:int}'
],
"grok_timezone": "Local",
"name_override": f"{partition['name']}",
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
# URLs declared for the application are probed with the http_response input;
# https ones additionally get their certificate checked by x509_cert.
urls = application.get("urls", [])
if urls:
  inputs["http_response"].append({
    "interval": "5m",
    "urls": urls,
    "tags": {"app": application["name"]},
  })
for url in urls:
  x509_url = url
  parsed_url = urllib.parse.urlparse(url)
  if parsed_url.scheme == 'https':
    # x509_cert wants a port
    if not parsed_url.port:
      x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
    inputs["x509_cert"].append({
      "sources": [x509_url],
      "interval": "5h",
      # Both tags have to live in a single "tags" entry: with two "tags"
      # keys in the same dict literal only the last one is kept, which
      # dropped the "url" tag.
      "tags": {"url": url, "app": application["name"]},
    })
# TODO some kind of GET request every X minutes ?
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = os.path.join(
self.options['location'],
f"telegraf-input-slapos-{application['name']}.cfg")
self._config_files[telegraf_slapos_input_config_file] = toml.dumps({
"inputs": {
"slapos": [{
"instance_root": application["instance-root"]}]}})
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": f"{application['name']}-processes",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": [f"{application['name']}-processes"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": [ f"{application['name']}-processes"],
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
self._config_files[options['output']] = toml.dumps(config)
# Install step of telegraf-config-file: creates the part's location directory,
# then writes every (path, content) pair that the init step registered in
# self._config_files (the main telegraf configuration and the per-application
# telegraf-input-slapos configuration files).
install =
import os
os.mkdir(self.options['location'])
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
# Parameters of the Loki server. They are read by loki-server-config-file to
# build the server configuration (HTTP listens on the IPv6 address, gRPC on
# the IPv4 address) and by clients that need the server URL and CA file.
# The URL is https; certificate files come from loki-server-certificate.
[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
# Service wrapper running the Loki binary with the generated configuration
# file. ready-url is the endpoint polled by loki-server-listen-promise.
# NOTE(review): hash-files / hash-existing-files presumably make the service
# restart when the configuration or the certificate changes - confirm against
# the slapos.cookbook wrapper recipe.
[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
"max_entries_limit_per_query": 50001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO check needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
# Holds an init script, reused by the loki-server-certificate section, which
# makes sure the certificate path exists: the parent directory is created
# when missing and an empty file is created when there is no certificate yet.
# An existing certificate file is left untouched.
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-server-listen-promise]
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
# agent + server
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
# req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
# [ req_ext ]
# subjectAltName = @alt_names
# [ alt_names ]
# IP.1 = ${loki-server:ipv4}
# IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable: sections extending this one must set `config` to the path of their CSR config file
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
# agent
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count needs:
# server: loki
# clients: loki promise, grafana, promtail
# TODO: this is a bad default
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout['bin-directory'],
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=5,
user_auto_approve_count=1,
key_len=2048,
)}}
[promtail]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ promtail_bin }} -config.file=${promtail-config-file:location}'
wrapper-path = ${directory:service}/promtail
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.cfg
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
install =
import os
# XXX make extra eggs available to buildout
import zc.buildout
import pkg_resources
buildout_options = self.buildout['buildout']
zc.buildout.easy_install.install(
['pyyaml'],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options['develop-eggs-directory'],
buildout_options['eggs-directory']])
import yaml
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
"http_listen_port": int(self.buildout['promtail']['http-port']),
"grpc_listen_address": self.buildout['promtail']['ip'],
"grpc_listen_port": int(self.buildout['promtail']['grpc-port']),
"graceful_shutdown_timeout": 5,
"external_url": self.buildout['promtail']['url'],
},
"positions": {
"filename": "{}/positions.yaml".format(self.buildout['promtail']['dir']),
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": self.buildout['loki-server']['ca-file'],
"cert_file": self.buildout['loki-promtail-client-certificate']['cert-file'],
"key_file": self.buildout['loki-promtail-client-certificate']['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
"scrape_configs": []
}
def get_job_selector(partition, job_name, application_name):
  """Build a LogQL stream selector such as '{app="x",key="value",job="name"}'.

  The selector contains the application name, every entry of the partition's
  optional 'static-tags' mapping and the job name (which takes precedence
  over a static tag named 'job').

  Values are emitted as double-quoted LogQL strings; backslashes and double
  quotes inside a value are escaped, so that such a value can no longer
  produce an invalid selector.
  """
  def quote(value):
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'

  labels = dict(partition.get('static-tags', {}), job=job_name)
  selector_parts = [f'app={quote(application_name)}']
  selector_parts.extend(f'{k}={quote(v)}' for k, v in labels.items())
  return "{" + ",".join(selector_parts) + "}"
def get_static_configs(partition, job_name, path, application):
  """Return the promtail `static_configs` list for one log file pattern.

  partition: dict with at least 'name'; 'reference' and 'static-tags' are
    optional.
  job_name: value of the `job` label.
  path: file pattern, may contain a `{directory}` placeholder which is
    replaced by the partition directory.
  application: dict with at least 'name'; 'instance-root' is optional.

  The partition directory is <instance-root>/<reference> when the partition
  has a reference and the application has an instance root, and is empty
  otherwise. The `partition_reference` label is only set for partitions
  having a reference, so partitions without one no longer raise KeyError.
  """
  reference = partition.get('reference')
  directory = ''
  # the instance root is a property of the application, it cannot be found
  # by looking into the path template.
  if reference and 'instance-root' in application:
    directory = os.path.join(application['instance-root'], reference)
  # static tags first, so that the labels set below take precedence
  labels = dict(
    partition.get('static-tags', {}),
    job=job_name,
    app=application['name'],
    name=partition['name'],
    computer_id=slap_connection['computer-id'],
    __path__=path.format(directory=directory),
  )
  if reference:
    labels['partition_reference'] = reference
  return [
    {
      "targets": [
        "localhost"
      ],
      "labels": labels,
    }
  ]
# Add grafana and influxdb own logs. TODO: not in agent mode
cfg['scrape_configs'].append(
{
"job_name": "Grafana",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Grafana", "reference": slap_connection['partition-id']},
"Grafana",
f"{self.buildout['directory']['home']}/.*_grafana*.log",
{"name": "Grafana"}
)
}
)
cfg['scrape_configs'].append(
{
"job_name": "Influxdb",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Influxdb", "reference": slap_connection['partition-id']},
"Influxdb",
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
{"name": "Grafana"},
)
}
)
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
if partition['type'] in ('erp5/zope-activity', 'erp5/zope-front'):
job_name = f"{partition['name']}-event-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
# TODO don't include the ----
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-event.log",
application,
)})
if partition['type'] == 'erp5/zope-front':
job_name = f"{partition['name']}-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
# drop requests for haproxy health check
"pipeline_stages": [
{
"drop": {
"expression": '.* "GET / HTTP/1.0" 200 .*'
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-Z2.log",
application,
)})
job_name = f"{partition['name']}-long-request-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^(?P<timestamp>.*) .*"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-longrequest.log",
application,
)})
if partition['type'] in ('erp5/mariadb', 'mariadb'):
job_name = f"{partition['name']}-mariadb-slow-queries"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happens
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": ".*SET timestamp=(?P<timestamp>\\d+);.*"
}
},
{
"timestamp": {
"format": "Unix",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/mariadb_slowquery.log",
application,
)})
job_name = f"{partition['name']}-mariadb-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"timestamp": {
"format": "2021-06-05 3:55:31",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/mariadb_error.log",
application,
)})
if partition['type'] == 'erp5/zeo':
job_name = f"{partition['name']}-zeo-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None,
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zeo-*.log",
application,
)})
if partition['type'] == 'erp5/balancer':
job_name = f"{partition['name']}-balancer-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/apache-access.log",
application,
)})
job_name = f"{partition['name']}-balancer-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/apache-error.log",
application,
)})
if partition.get('log-file-patterns'):
job_name = partition['name']
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
f"{partition['log-file-patterns']}",
application,
)})
with open(self.options['location'], 'w') as f:
yaml.dump(cfg, f)
{% endraw %}
# Promise built on check-port-listening-promise (defined outside this chunk),
# pointed at the IP address and HTTP port of the promtail section.
[promtail-listen-promise]
<= check-port-listening-promise
hostname = ${promtail:ip}
port = ${promtail:http-port}
# Requests a shared "Grafana Frontend" instance of the apache-frontend
# software release in front of the Grafana URL. The returned `secure_access`
# value is checked by apache-frontend-available-promise and published as the
# `url` connection parameter.
# NOTE(review): the software-url below is plain http and pinned to HEAD.
[apache-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
return = domain secure_access
[apache-frontend-available-promise]
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
# Lists the paths of all promises of this instance (influxdb, grafana, loki,
# promtail and the frontend). The recipe is intentionally left empty.
# NOTE(review): presumably this section is only referenced to pull the
# promise sections into the build - confirm against the section using it.
[promises]
recipe =
instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${promtail-listen-promise:path}
${apache-frontend-available-promise:path}
# Connection parameters published for the requester: InfluxDB access (URL,
# database and credentials), the telegraf extra configuration directory,
# Grafana access (URL and admin credentials), the Loki, Loki caucase and
# promtail URLs, and the frontend URL as `url`.
# Note that the InfluxDB and Grafana passwords are published here.
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
promtail-url = ${promtail:url}
url = ${apache-frontend:connection-secure_access}
......@@ -129,15 +129,18 @@ url = ${:_profile_base_location_}/${:filename}
[grafana-provisioning-dashboards-config-file]
<= download-file-base
[loki-config-file]
<= download-file-base
[instance-eggs]
recipe = zc.recipe.egg
eggs =
${python-PyYAML:egg}
toml
[instance-agent]
<= download-file-base
[instance-default]
<= download-file-base
[instance-profile]
recipe = slapos.recipe.template:jinja2
url = ${:_profile_base_location_}/${:filename}
......@@ -145,6 +148,8 @@ output = ${buildout:directory}/instance.cfg
extensions = jinja2.ext.do
context =
section buildout buildout
key instance_default instance-default:target
key instance_agent instance-agent:target
key openssl_bin openssl-output:openssl
key telegraf_bin gowork:telegraf-bin
key telegraf_input_slapos_bin gowork:telegraf-input-slapos-bin
......@@ -157,13 +162,12 @@ context =
key curl_bin :curl-bin
key dash_bin :dash-bin
key jq_bin :jq-bin
key caucase_jinja2_library caucase-jinja2-library:target
curl-bin = ${curl:location}/bin/curl
dash-bin = ${dash:location}/bin/dash
jq-bin = ${jq:location}/bin/jq
depends = ${instance-eggs:eggs} ${caucase-eggs:eggs}
import-list =
file caucase caucase-jinja2-library:target
[versions]
inotifyx = 0.2.2
toml = 0.10.2
......@@ -53,8 +53,7 @@ class GrafanaTestCase(SlapOSInstanceTestCase):
Since the instances takes time to start and stop,
we increase the number of retries.
"""
# instance_max_retry = 50
instance_max_retry = 30 # TODO
instance_max_retry = 50
report_max_retry = 30
......@@ -117,6 +116,8 @@ class TestGrafana(GrafanaTestCase):
if loki_health.get('data'):
break
time.sleep(retry)
else:
self.fail(loki_health)
self.assertEqual(loki_health['status'], "success")
self.assertIn("app", loki_health['data'])
......@@ -132,19 +133,20 @@ class TestGrafana(GrafanaTestCase):
class TestGrafanaEmailEnabled(GrafanaTestCase):
__partition_reference__ = 'mail'
smtp_verify_ssl = "true"
smtp_verify_ssl = True
smtp_skip_verify = "false"
@classmethod
def getInstanceParameterDict(cls):
return json.dumps({"_": {
return {"_": json.dumps({
"email": {
"smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username",
"smtp-password": "smtp_password",
'smtp-verify-ssl': cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name",
}})
}})}
def test_email_enabled(self):
config = configparser.ConfigParser()
......@@ -163,7 +165,7 @@ class TestGrafanaEmailEnabled(GrafanaTestCase):
class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled):
smtp_verify_ssl = "false"
smtp_verify_ssl = False
smtp_skip_verify = "true"
......@@ -212,9 +214,14 @@ class TestTelegraf(GrafanaTestCase):
"instance-root": cls.slap._instance_root,
"partitions": [
{
"name": "test grafana - partition name",
"name": "test grafana - default partition",
"type": "default",
"reference": "G0"
"reference": "G0", # XXX assumes partitions will be allocated in order
},
{
"name": "test grafana - agent partition",
"type": "default",
"reference": "G1"
},
],
},
......@@ -223,6 +230,10 @@ class TestTelegraf(GrafanaTestCase):
}
return {'_': json.dumps(parameter_dict)}
def setUp(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
def test_telegraf_running(self):
with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo()
......@@ -230,9 +241,6 @@ class TestTelegraf(GrafanaTestCase):
self.assertEqual(process_info['statename'], 'RUNNING')
def test_telegraf_ingest_slapos_metrics(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
# wait for data to be ingested
time.sleep(16)
......@@ -264,27 +272,27 @@ class TestTelegraf(GrafanaTestCase):
if resp.ok and resp.json()['results'][0].get('series'):
break
time.sleep(i)
else:
self.fail(resp.text)
series = resp.json()['results'][0].get('series')
print(series)
breakpoint()
# hashes and "-on-watch" is removed from process_name
self.asserIn('grafana', [s['tags']['process_name'] for s in series])
self.asserIn('telegraf', [s['tags']['process_name'] for s in series])
self.asserIn('loki-service', [s['tags']['process_name'] for s in series])
self.asserIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
self.assertIn('grafana', [s['tags']['process_name'] for s in series])
self.assertIn('telegraf', [s['tags']['process_name'] for s in series])
self.assertIn('loki-service', [s['tags']['process_name'] for s in series])
self.assertIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
tags = [s['tags'] for s in series][0]
self.assertEqual(tags['name'], 'test grafana - partition name')
tags = [s['tags'] for s in series if s['tags']['partition_reference'] == 'G0'][0]
self.assertEqual(tags['name'], 'test grafana - default partition')
self.assertEqual(tags['computer_id'], self.slap._computer_id)
self.assertEqual(tags['partition_reference'], 'G0')
self.assertEqual(
set([s['tags']['partition_reference'] for s in series]),
{'G0'},
{s['tags']['partition_reference'] for s in series},
{'G0', 'G1'},
)
self.fail('TODO')
class TestLoki(GrafanaTestCase):
@classmethod
......@@ -296,12 +304,10 @@ class TestLoki(GrafanaTestCase):
"applications": [
{
"name": "TestLoki",
# "instance-root": "/", # XXX needed ?
"partitions": [
{
# no slapos for system application
"name": "test log file",
"log-file-patterns": cls._logfile.name,
"log-file-patterns": [cls._logfile.name],
"static-tags": {
"testtag": "foo",
},
......@@ -352,14 +358,17 @@ class TestLoki(GrafanaTestCase):
if result := resp.json().get('data', {}).get('result', []):
break
time.sleep(i)
else:
self.fail(resp.text)
self.assertEqual(
result[0]['stream'],
{
'app': 'TestLoki',
'computer_id': self.slap._computer_id,
'detected_level': 'info',
'filename': self._logfile.name,
'job': 'test log file',
'partition': 'test log file',
'job': 'TestLoki-test log file',
'name': 'test log file',
'service_name': 'TestLoki',
'testtag': 'foo',
}
......@@ -404,13 +413,13 @@ class TestListenInPartition(GrafanaTestCase):
c.laddr for c in self.process_dict['influxdb'].connections()
if c.status == 'LISTEN'
]),
[
sorted([
(self._ipv4_address, 8088),
(self.computer_partition_ipv6_address, 8086),
],
]),
)
def test_telegraph_listen(self):
def test_telegraf_listen(self):
self.assertEqual(
[
c.laddr for c in self.process_dict['telegraf'].connections()
......@@ -425,10 +434,10 @@ class TestListenInPartition(GrafanaTestCase):
c.laddr for c in self.process_dict['loki-service'].connections()
if c.status == 'LISTEN'
]),
[
(self.computer_partition_ipv6_address, 3100),
sorted([
(self._ipv4_address, 9095),
],
(self.computer_partition_ipv6_address, 3100),
]),
)
def test_promtail_listen(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment