Commit c9867831 authored by Jérome Perrin's avatar Jérome Perrin

grafana split

parent a9cebcfd
...@@ -15,7 +15,16 @@ ...@@ -15,7 +15,16 @@
[instance-profile] [instance-profile]
filename = instance.cfg.in filename = instance.cfg.in
md5sum = e4d5ac3e6ad239d3bf48c2b3172919b5 md5sum = 32c772c593d2c3c38c26186b91b78cf8
[instance-default]
filename = instance-default.cfg.in
md5sum = 5a9650b8654b2aeaeab076b08b248b70
[instance-agent]
filename = instance-agent.cfg.in
md5sum = edb4eeb900ad13a3b3a6e174c1ea533b
[influxdb-config-file] [influxdb-config-file]
filename = influxdb-config-file.cfg.in filename = influxdb-config-file.cfg.in
...@@ -23,9 +32,10 @@ md5sum = a28972ced3e0f4aa776e43a9c44717c0 ...@@ -23,9 +32,10 @@ md5sum = a28972ced3e0f4aa776e43a9c44717c0
[grafana-config-file] [grafana-config-file]
filename = grafana-config-file.cfg.in filename = grafana-config-file.cfg.in
md5sum = 83a8445858eab21a12f1769c23424bea md5sum = 2b75d6b1984d9d154303ec773aa88474
[grafana-provisioning-dashboards-config-file] [grafana-provisioning-dashboards-config-file]
filename = grafana-provisioning-dashboards-config-file.cfg.in filename = grafana-provisioning-dashboards-config-file.cfg.in
md5sum = 5616679a9c5c2757540175ead3f5500a md5sum = 5616679a9c5c2757540175ead3f5500a
...@@ -334,20 +334,21 @@ allow_sign_up = true ...@@ -334,20 +334,21 @@ allow_sign_up = true
#################################### SMTP / Emailing ##################### #################################### SMTP / Emailing #####################
[smtp] [smtp]
{% set email = slapparameter_dict.get('email', {}) %}
#enabled = false #enabled = false
enabled = {{ slapparameter_dict.get('smtp-server') and 'true' or 'false' }} enabled = {{ email.get('smtp-server') and 'true' or 'false' }}
#host = locahost:25 #host = locahost:25
host = {{ slapparameter_dict.get('smtp-server', '') }} host = {{ email.get('smtp-server', '') }}
#user = #user =
user = {{ slapparameter_dict.get('smtp-username', '') }} user = {{ email.get('smtp-username', '') }}
# If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;""" # If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;"""
#password = #password =
password = {{ slapparameter_dict.get('smtp-password', '') and '"""%s"""' % slapparameter_dict['smtp-password'] or ""}} password = {{ email.get('smtp-password', '') and '"""%s"""' % email['smtp-password'] or ""}}
cert_file = cert_file =
key_file = key_file =
skip_verify = {{ slapparameter_dict.get('smtp-verify-ssl') and 'true' or 'false' }} skip_verify = {{ email.get('smtp-verify-ssl') and 'false' or 'true' }}
from_address = {{ slapparameter_dict.get('email-from-address', '') }} from_address = {{ email.get('email-from-address', '') }}
from_name = {{ slapparameter_dict.get('email-from-name', 'Grafana') }} from_name = {{ email.get('email-from-name', 'Grafana') }}
ehlo_identity = ehlo_identity =
[emails] [emails]
......
{ {
"$schema": "http://json-schema.org/draft-04/schema", "$schema": "https://json-schema.org/draft/2019-09/schema",
"description": "Parameters to instantiate an agent collecting logs and metrics", "description": "Parameters to instantiate an agent collecting logs and metrics",
"type": "object", "type": "object",
"additionalProperties": false, "additionalProperties": false,
"unevaluatedProperties": false,
"$defs": { "$defs": {
"type": { "type": {
"description": "Type of the application. With `SlapOS` type, some metrics are collected from supervisor and from some known partition types (for example: ERP5's mariadb or ERP5's zopes). With `system` type, only log files are ingested.", "description": "Type of the application. With `SlapOS` type, some metrics are collected from supervisor and from some known partition types (for example: ERP5's mariadb or ERP5's zopes). With `system` type, only log files are ingested.",
...@@ -36,7 +37,7 @@ ...@@ -36,7 +37,7 @@
"description": "Static tags for this partition", "description": "Static tags for this partition",
"examples": [ "examples": [
{ {
"region": "eu", "service-level": "production",
"data-center": "abc123" "data-center": "abc123"
} }
] ]
...@@ -76,7 +77,11 @@ ...@@ -76,7 +77,11 @@
}, },
"instance-root": { "instance-root": {
"description": "Directory containing SlapOS partitions.", "description": "Directory containing SlapOS partitions.",
"type": "string" "type": "string",
"examples": [
"/srv/slapgrid/",
"/srv/slapgrid/slappart30/srv/runner/instance/"
]
}, },
"partitions": { "partitions": {
"description": "SlapOS partitions to monitor", "description": "SlapOS partitions to monitor",
...@@ -87,7 +92,7 @@ ...@@ -87,7 +92,7 @@
"name", "name",
"reference" "reference"
], ],
"additionalProperties": false, "unevaluatedProperties": false,
"properties": { "properties": {
"name": { "name": {
"type": "string", "type": "string",
...@@ -120,13 +125,39 @@ ...@@ -120,13 +125,39 @@
"default": "default" "default": "default"
}, },
"log-file-patterns": { "log-file-patterns": {
"$refs": "#/$defs/log-file-patterns", "$ref": "#/$defs/log-file-patterns",
"description": "Glob pattern for log files to watch. This mostly makes sense for `default` partition type" "description": "Glob pattern for log files to watch. This mostly makes sense for `default` partition type"
}, },
"static-tags": { "static-tags": {
"$refs": "#/$defs/static-tags" "$ref": "#/$defs/static-tags"
}
},
"allOf": [
{
"if": {
"properties": {
"type": {
"enum": [
"mariadb",
"erp5/mariadb"
]
}
} }
}, },
"then": {
"properties": {
"dbname": {
"type": "string",
"description": "Database name"
},
"username": {
"type": "string",
"description": "Username to connect to database"
}
}
}
}
],
"examples": [ "examples": [
{ {
"name": "zope-backoffice", "name": "zope-backoffice",
...@@ -187,10 +218,10 @@ ...@@ -187,10 +218,10 @@
] ]
}, },
"log-file-patterns": { "log-file-patterns": {
"$refs": "#/$defs/log-file-patterns" "$ref": "#/$defs/log-file-patterns"
}, },
"static-tags": { "static-tags": {
"$refs": "#/$defs/static-tags" "$ref": "#/$defs/static-tags"
} }
}, },
"examples": [ "examples": [
......
...@@ -6,6 +6,15 @@ ...@@ -6,6 +6,15 @@
"telegraf-extra-config-dir": { "telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern", "description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"type": "string" "type": "string"
},
"promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
} }
}, },
"type": "object" "type": "object"
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
import urllib.parse
influxdb = options['slapparameter-dict']['influxdb']
options['url'] = influxdb['url']
options['database'] = influxdb['database']
options['auth-username'] = influxdb['username']
options['auth-password'] = influxdb['password']
parsed_url = urllib.parse.urlparse(options['url'])
options['hostname'] = parsed_url.hostname
options['port'] = str(parsed_url.port)
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb-server:hostname}
port = ${influxdb-server:port}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && ${:nice} {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
hash-files = ${telegraf-config-file:output}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
[telegraf-config-file]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
input-socket = ${directory:var}/tg.sock
init =
import zc.buildout
import pkg_resources
buildout_options = self.buildout["buildout"]
zc.buildout.easy_install.install(
["toml"],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options["develop-eggs-directory"],
buildout_options["eggs-directory"]])
import collections
import pathlib
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb-server']
self._config_files = {} # files to create during install step
access_path_dict = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
for application in slapparameter_dict.get("applications", []):
partition_mapping = {}
partition_directory = ''
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
if "reference" in partition:
partition_mapping[partition["reference"]] = partition["name"]
if application.get("instance-root"):
partition_directory = pathlib.Path(application["instance-root"]) / partition['reference']
if partition["type"] in ("erp5/mariadb", "mariadb"):
partition.setdefault("username", "root")
partition.setdefault("dbname", "erp5")
mariadb_socket = f"{partition_directory}/var/run/mariadb.sock"
dsn = f"{partition['username']}@unix({mariadb_socket})/{partition['dbname']}"
access_path_dict[mariadb_socket] = 'rw'
inputs["mysql"].append(
{
"servers": [dsn],
"gather_innodb_metrics": True,
"gather_slave_status": True,
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
inputs["sql"].append(
{
"name_override": "mariadb-activities",
"driver": "mysql",
"dsn": dsn,
"query": [
{
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -2 and -10
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -2 and -10
""",
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/balancer":
# XXX do we really want this one ?
access_log = f"{partition_directory}/var/log/apache-access.log"
access_path_dict[access_log] = 'r'
inputs["tail"].append(
{
"data_format": "grok",
"files": [access_log],
"grok_custom_pattern_files": [],
"grok_custom_patterns": "",
"grok_patterns": [
'%{IPORHOST:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \\[%{HTTPDATE:timestamp}\\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-) %{QS:referrer} %{QS:agent} %{NUMBER:response_time:int}'
],
"grok_timezone": "Local",
"name_override": "haproxy_logs"
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
inputs["haproxy"].append(
{
"servers": [f"{partition_directory}/var/run/ha.sock"],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
})
urls = application.get("urls", [])
if urls:
inputs["http_response"].append({
"interval": "5m",
"urls": urls,
"tags": {"app": application["name"]},
})
for url in urls:
x509_url = url
parsed_url = urllib.parse.urlparse(url)
if parsed_url.scheme == 'https':
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
inputs["x509_cert"].append({
"sources": [x509_url],
"tags": {"url": url},
"interval": "5h",
"tags": {"app": application["name"]},
})
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = str(
pathlib.Path(self.options['location'])
/ f"telegraf-input-slapos-{application['name']}.cfg"
)
self._config_files[telegraf_slapos_input_config_file] = toml.dumps({
"inputs": {
"slapos": [{
"instance_root": application["instance-root"]}]}})
access_path_dict[f"{application['instance-root']}/sv.sock"] = 'rw'
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": "slapos_services",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": ["slapos_services"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": ["slapos_services"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": ["slapos_services"],
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": ["slapos_services"],
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": ["slapos_services"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": ["slapos_services"],
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
# add a socket input so that we can have a promise verifying that telegraf is listening
inputs['socket_listener'].append({"service_address": f"unix://{self.options['input-socket']}"})
options['access-path-dict'] = access_path_dict
self._config_files[options['output']] = toml.dumps(config)
install =
import os
os.mkdir(self.options['location'])
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
[loki-server]
recipe = slapos.recipe.build
slapparameter-dict = ${slap-configuration:configuration}
init =
loki = options['slapparameter-dict']['loki']
options['url'] = loki['url']
options['caucase-url'] = loki['caucase-url']
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
distinguished_name = dn
[ dn ]
CN = ${:cn}
L = ${slap-connection:computer-id}
O = ${slap-connection:partition-id}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
cn = loki ${slap-connection:partition-id}@${slap-connection:computer-id}
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-server:caucase-url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[promtail]
recipe = slapos.cookbook:wrapper
command-line = ${:nice} {{ promtail_bin }} -config.file=${promtail-config-file:location}
wrapper-path = ${directory:service}/promtail
hash-files =
${promtail-config-file:location}
# TODO: control nice of the agent ?
{% if 0 %}
nice = nice -19 chrt --idle 0 ionice -c3
{% else %}
nice =
{% endif %}
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.yaml
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
init =
import pathlib
import json
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
loki_certificate = self.buildout['loki-promtail-client-certificate']
self._config_files = {} # files to create during install step
access_path_dict = {}
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
"http_listen_port": int(self.buildout['promtail']['http-port']),
"grpc_listen_address": self.buildout['promtail']['ip'],
"grpc_listen_port": int(self.buildout['promtail']['grpc-port']),
"graceful_shutdown_timeout": 5,
"external_url": self.buildout['promtail']['url'],
},
"positions": {
"filename": "{}/positions.yaml".format(self.buildout['promtail']['dir']),
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": loki_certificate['ca-file'],
"cert_file": loki_certificate['cert-file'],
"key_file": loki_certificate['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
"scrape_configs": []
}
def get_job_selector(partition, job_name, application_name):
# make a selector in LogQL, like '{job="job_name",key="value"}'
selector_parts = [f'app="{application_name}"']
for k, v in dict(partition.get('static-tags', {}), job=job_name).items():
selector_parts.append(f'{k}="{v}"')
return "{%s}" % ",".join(selector_parts)
def get_static_configs(partition, job_name, path_list, application):
if not isinstance(path_list, list):
raise ValueError(f'{path_list!r} is not a list')
directory = ''
if partition.get('reference') and 'instance-root' in application:
instance_root = pathlib.Path(application['instance-root'])
directory = instance_root / partition['reference']
path_list = [path.format(directory=directory) for path in path_list]
for path in path_list:
access_path_dict[path] = 'r'
partition_kw = {}
if partition.get('reference'):
partition_kw['partition_reference'] = partition['reference']
return [
{
"targets": [
"localhost"
],
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
app=application['name'],
name=partition['name'],
computer_id=slap_connection['computer-id'],
__path__=path,
**partition_kw
)
} for path in path_list
]
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
if partition['type'] in ('erp5/zope-activity', 'erp5/zope-front'):
# job name include the app name because they need to be unique
job_name = f"{application['name']}-{partition['name']}-event-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# TODO this does not seem to work well
"firstline": "^------",
"max_wait_time": "5s"
}
},
{
"regex": {
# TODO don't include the ----
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zope-*-event.log"],
application,
)})
if partition['type'] == 'erp5/zope-front':
job_name = f"{application['name']}-{partition['name']}-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
# drop requests for haproxy health check
"pipeline_stages": [
{
"drop": {
"expression": '.* "GET / HTTP/1.0" 200 .*'
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zope-*-Z2.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-long-request-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^(?P<timestamp>.*) .*"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/longrequest_logger_zope-*.log"],
application,
)})
if partition['type'] in ('erp5/mariadb', 'mariadb'):
job_name = f"{application['name']}-{partition['name']}-mariadb-slow-queries"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happens
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": ".*SET timestamp=(?P<timestamp>\\d+);.*"
}
},
{
"timestamp": {
"format": "Unix",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/mariadb_slowquery.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-mariadb-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"timestamp": {
"format": "2021-06-05 3:55:31",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/mariadb_error.log"],
application,
)})
if partition['type'] == 'erp5/zeo':
job_name = f"{application['name']}-{partition['name']}-zeo-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None,
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/zeo-*.log"],
application,
)})
if partition['type'] == 'erp5/balancer':
job_name = f"{application['name']}-{partition['name']}-balancer-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/apache-access.log"],
application,
)})
job_name = f"{application['name']}-{partition['name']}-balancer-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
["{directory}/var/log/apache-error.log"],
application,
)})
if partition.get('log-file-patterns'):
job_name = f"{application['name']}-{partition['name']}"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
partition['log-file-patterns'],
application,
)})
self._config_files[options['location']] = json.dumps(cfg, indent=2)
options['access-path-dict'] = access_path_dict
install =
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
{% endraw %}
[promtail-listen-promise]
<= check-port-listening-promise
hostname = ${promtail:ip}
port = ${promtail:http-port}
[telegraf-listen-promise]
recipe = slapos.cookbook:wrapper
command-line =
test -S ${telegraf-config-file:input-socket}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[facl-script]
recipe = slapos.recipe.build
promtail-access-path-dict = ${promtail-config-file:access-path-dict}
telegraf-access-path-dict = ${telegraf-config-file:access-path-dict}
install =
import itertools
import os
import pathlib
import pwd
import shlex
user = pwd.getpwuid(os.getuid()).pw_name
script_code = ''
def quote_path(p):
# quote, but preserve *
p = str(p)
assert '__STAR__' not in p
p = p.replace('*', '__STAR__')
p = shlex.quote(p)
p = p.replace('__STAR__', '*')
return p
# make sure we can access the parents folders
parent_access = {}
def check_parent_access(path):
parent = path.parent
if parent != path:
parent_access[str(parent)] = 'x'
check_parent_access(parent)
for path_spec, access in itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items()):
path = pathlib.Path(path_spec)
check_parent_access(path)
for path_spec, access in sorted(itertools.chain(
options['promtail-access-path-dict'].items(),
options['telegraf-access-path-dict'].items(),
parent_access.items())):
path = pathlib.Path(path_spec)
if '*' in path_spec:
script_code += f'setfacl --modify=u:{user}:rx {quote_path(path.parent)}\n'
script_code += f'setfacl --modify=u:{user}:{access} {quote_path(path)}\n'
pathlib.Path(location).write_text(script_code)
[promises]
recipe =
instance-promises =
${promtail-listen-promise:path}
${telegraf-listen-promise:wrapper-path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
facl-script = ${facl-script:location}
promtail-url = ${promtail:url}
...@@ -3,6 +3,11 @@ ...@@ -3,6 +3,11 @@
"description": "Parameters to instantiate Grafana", "description": "Parameters to instantiate Grafana",
"type": "object", "type": "object",
"additionalProperties": false, "additionalProperties": false,
"properties": {
"email": {
"type": "object",
"description": "Email configuration",
"additionalProperties": false,
"properties": { "properties": {
"smtp-server": { "smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.", "description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
...@@ -17,23 +22,49 @@ ...@@ -17,23 +22,49 @@
"type": "string" "type": "string"
}, },
"smtp-verify-ssl": { "smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server", "description": "Verify certificate of SMTP server",
"type": "boolean" "type": "boolean",
"default": true
}, },
"email-from-address": { "email-from-address": {
"description": "Email address used in From: header of emails", "description": "Email address used in `From:` header of emails",
"type": "string" "type": "string"
}, },
"email-from-name": { "email-from-name": {
"description": "Name used in From: header of emails", "description": "Name used in `From:` header of emails",
"default": "Grafana", "default": "Grafana",
"type": "string" "type": "string"
}
}
},
"frontend": {
"type": "object",
"additionalProperties": false,
"properties": {
"custom-domain": {
"description": "Custom domain to use when requesting a rapid-cdn frontend",
"type": "string",
"format": "hostname"
}
}
}, },
"caucase-url": { "caucase": {
"description": "URL of a caucase instance to manage all server and clients certificates", "type": "object",
"description": "Caucase configuration. To connect external agents, it's required to approve their client certificates, either using an external caucase referenced as `external-caucase-url` or registering a user with `user-auto-approve-count`",
"additionalProperties": false,
"properties": {
"external-caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates, to use instead of embedding caucase",
"type": "string", "type": "string",
"format": "uri" "format": "uri"
}, },
"user-auto-approve-count": {
"description": "Number of users to automatically approve in the embedded caucase",
"type": "integer",
"default": 0
}
}
},
"influxdb": { "influxdb": {
"description": "Fine tuning influxdb parameters", "description": "Fine tuning influxdb parameters",
"type": "object", "type": "object",
......
{ {
"$schema": "http://json-schema.org/draft-07/schema#", "$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by Grafana instantiation", "description": "Values returned by Grafana instantiation",
"additionalProperties": false,
"properties": { "properties": {
"url": { "url": {
"description": "Shared frontend for this Grafana instance", "description": "Shared frontend for this Grafana instance",
...@@ -47,6 +46,15 @@ ...@@ -47,6 +46,15 @@
"description": "URL caucase service used by Loki", "description": "URL caucase service used by Loki",
"format": "uri", "format": "uri",
"type": "string" "type": "string"
},
"agent-promtail-url": {
"description": "URL of embedded server from promtail",
"format": "uri",
"type": "string"
},
"agent-facl-script": {
"description": "Path of a generated script to set ACL for the agent to access files and sockets. This might be needed depending on how slapos partitions were formatted",
"type": "string"
} }
}, },
"type": "object" "type": "object"
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
publish-connection-parameter
eggs-directory = {{ buildout_eggs_directory }}
develop-eggs-directory = {{ buildout_develop_eggs_directory }}
offline = true
[instance-parameter]
recipe = slapos.cookbook:slapconfiguration
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
grafana-dashboards-dir = ${:grafana-dir}/dashboards
loki-dir = ${:srv}/loki
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
# macros
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
then
{{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
-subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
-newkey rsa -keyout ${:key-file} \
-out ${:cert-file}
fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout_parts_directory }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
<= config-file
context =
section influxdb influxdb
[influxdb-password]
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
recipe = slapos.cookbook:wrapper
command-line =
bash -c "{{ influx_bin }} \
-username ${influxdb:auth-username} \
-password ${influxdb:auth-password} \
-host [${influxdb:host}] \
-port ${influxdb:http-port} \
-unsafeSsl \
-ssl \
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION {{ slapparameter_dict.get('influxdb', {}).get('default-retention-policy-days', 720) }}d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
provisioning-dashboards-dir = ${directory:grafana-provisioning-dashboards-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-files =
${grafana-config-file:output}
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
<= generate-insecure-self-signed-certificate
[grafana-password]
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
[grafana-config-file]
<= config-file
context =
section grafana grafana
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": options['influxdb-database'],
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 50000,
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
<= config-file
rendered = ${grafana:provisioning-dashboards-dir}/dashboard.yaml
context =
key dashboards_dir directory:grafana-dashboards-dir
[grafana-listen-promise]
<= check-port-listening-promise
hostname= ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
"max_entries_limit_per_query": 50001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO check needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-server-listen-promise]
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout_bin_directory,
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
{% if slapparameter_dict.get('caucase', {}).get('external-caucase-url') %}
[loki-caucased]
url = {{ slapparameter_dict.get('caucase', {}).get('external-caucase-url') }}
{% else %}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count is 4 for the default:
# - server: loki
# - clients: loki promise, grafana, promtail
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout_bin_directory,
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=4,
user_auto_approve_count='${loki-caucased-user-auto-approve-count:user-auto-approve-count}',
key_len=2048,
)}}
[loki-caucased-user-auto-approve-count]
user-auto-approve-count = {{ slapparameter_dict.get('caucase', {}).get('user-auto-approve-count', 0) }}
{% endif %}
[apache-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
{% if slapparameter_dict.get('frontend', {}).get('custom-domain') %}
config-custom_domain = {{ slapparameter_dict['frontend']['custom-domain'] }}
{% endif %}
return = domain secure_access
[apache-frontend-available-promise]
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
[request-agent-config]
recipe = slapos.recipe.build
init =
slap_connection = self.buildout["slap-connection"]
configuration = self.buildout['slap-configuration']['configuration']
applications = configuration.get('agent', {}).get('applications', [])
applications.append(
# Add a default config ingesting grafana's and influxdb's logs
{
"name": "Grafana",
"type": "system",
"partitions": [
{
"name": "grafana",
"static-tags": {
"partition_reference": slap_connection['partition-id'],
},
"log-file-patterns": [
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
]
},
{
"name": "influxdb",
"static-tags": {
"partition_reference": slap_connection['partition-id'],
},
"log-file-patterns": [
f"{self.buildout['directory']['home']}/.*_grafana*.log",
]
},
]
}
)
options['applications'] = applications
options['loki'] = {
'url': self.buildout['loki-server']['url'],
'caucase-url': self.buildout['loki-caucased']['url'],
}
options['influxdb'] = {
"url": self.buildout['influxdb']['url'],
"database": self.buildout['influxdb']['database'],
"username": self.buildout['influxdb']['auth-username'],
"password": self.buildout['influxdb']['auth-password'],
}
[request-slapos-partition-base]
recipe = slapos.cookbook:request.serialised
software-url = ${slap-connection:software-release-url}
server-url = ${slap-connection:server-url}
key-file = ${slap-connection:key-file}
cert-file = ${slap-connection:cert-file}
computer-id = ${slap-connection:computer-id}
partition-id = ${slap-connection:partition-id}
[request-agent]
<= request-slapos-partition-base
software-type = agent
name = agent
return = facl-script promtail-url
config-applications = ${request-agent-config:applications}
config-loki = ${request-agent-config:loki}
config-influxdb = ${request-agent-config:influxdb}
[agent-promtail-url]
recipe = slapos.cookbook:urlparse
url = ${request-agent:connection-promtail-url}
[agent-promtail-listen-promise]
<= check-port-listening-promise
hostname = ${agent-promtail-url:host}
port = ${agent-promtail-url:port}
[promises]
recipe =
instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${apache-frontend-available-promise:path}
${agent-promtail-listen-promise:path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
url = ${apache-frontend:connection-secure_access}
agent-facl-script = ${request-agent:connection-facl-script}
agent-promtail-url = ${request-agent:connection-promtail-url}
{% import "caucase" as caucase with context %}
[buildout] [buildout]
parts = parts = switch-softwaretype
promises
publish-connection-parameter
eggs-directory = {{ buildout['eggs-directory'] }} eggs-directory = {{ buildout['eggs-directory'] }}
develop-eggs-directory = {{ buildout['develop-eggs-directory'] }} develop-eggs-directory = {{ buildout['develop-eggs-directory'] }}
offline = true offline = true
[jinja2-template-base]
[instance-parameter] recipe = slapos.recipe.template:jinja2
recipe = slapos.cookbook:slapconfiguration filename = ${:_buildout_section_name_}.cfg
computer = ${slap-connection:computer-id} output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:filename}
partition = ${slap-connection:partition-id} extensions =
url = ${slap-connection:server-url} jinja2.ext.do
key = ${slap-connection:key-file} extra-context =
cert = ${slap-connection:cert-file} context =
raw buildout_bin_directory {{ buildout['bin-directory'] }}
raw buildout_parts_directory {{ buildout['parts-directory'] }}
raw buildout_eggs_directory {{ buildout['eggs-directory'] }}
raw buildout_develop_eggs_directory {{ buildout['develop-eggs-directory'] }}
key slapparameter_dict slap-configuration:configuration
raw instance_default {{ instance_default }}
raw instance_agent {{ instance_agent }}
raw openssl_bin {{ openssl_bin }}
raw telegraf_bin {{ telegraf_bin }}
raw telegraf_input_slapos_bin {{ telegraf_input_slapos_bin }}
raw influxd_bin {{ influxd_bin }}
raw influx_bin {{ influx_bin }}
raw grafana_bin {{ grafana_bin }}
raw grafana_homepath {{ grafana_homepath }}
raw loki_bin {{ loki_bin }}
raw promtail_bin {{ promtail_bin }}
raw curl_bin {{ curl_bin }}
raw dash_bin {{ dash_bin }}
raw jq_bin {{ jq_bin }}
import-list =
file caucase context:caucase-jinja2-library
[context]
caucase-jinja2-library = {{ caucase_jinja2_library }}
[instance-default]
<= jinja2-template-base
url = {{ instance_default }}
[instance-agent]
<= jinja2-template-base
url = {{ instance_agent }}
[switch-softwaretype]
recipe = slapos.cookbook:switch-softwaretype
default = instance-default:output
RootSoftwareInstance = ${:default}
agent = instance-agent:output
[slap-configuration] [slap-configuration]
# apache-frontend reads from from a part named [slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id} computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id} partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url} url = ${slap-connection:server-url}
key = ${slap-connection:key-file} key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file} cert = ${slap-connection:cert-file}
[directory]
recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
influxdb-data-dir = ${:srv}/influxdb
grafana-dir = ${:srv}/grafana
grafana-data-dir = ${:grafana-dir}/data
grafana-logs-dir = ${:var}/log
grafana-plugins-dir = ${:grafana-dir}/plugins
grafana-provisioning-config-dir = ${:grafana-dir}/provisioning-config
grafana-provisioning-datasources-dir = ${:grafana-provisioning-config-dir}/datasources
grafana-provisioning-dashboards-dir = ${:grafana-provisioning-config-dir}/dashboards
grafana-dashboards-dir = ${:grafana-dir}/dashboards
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
loki-dir = ${:srv}/loki
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
then
{{ openssl_bin }} req -x509 -nodes -sha256 -days 3650 \
-subj "/C=AA/ST=X/L=X/O=Dis/CN=${:common-name}" \
-newkey rsa -keyout ${:key-file} \
-out ${:cert-file}
fi
update-command = ${:command}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
[config-file]
recipe = slapos.recipe.template:jinja2
url = {{ buildout['parts-directory'] }}/${:_buildout_section_name_}/${:_buildout_section_name_}.cfg.in
output = ${directory:etc}/${:_buildout_section_name_}.cfg
extensions = jinja2.ext.do
[check-port-listening-promise]
recipe = slapos.cookbook:check_port_listening
path = ${directory:promise}/${:_buildout_section_name_}
[check-url-available-promise]
recipe = slapos.cookbook:check_url_available
path = ${directory:promise}/${:_buildout_section_name_}
dash_path = {{ dash_bin }}
curl_path = {{ curl_bin }}
[influxdb]
ipv6 = ${instance-parameter:ipv6-random}
ipv4 = ${instance-parameter:ipv4-random}
host = ${:ipv6}
local-host = ${:ipv4}
rpc-port = 8088
http-port = 8086
url = https://[${:host}]:${:http-port}
data-dir = ${directory:influxdb-data-dir}
auth-username = ${influxdb-password:username}
auth-password = ${influxdb-password:passwd}
unix-socket = ${directory:var}/influxdb.socket
ssl-cert-file = ${influxdb-certificate:cert-file}
ssl-key-file = ${influxdb-certificate:key-file}
database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
<= config-file
context =
section influxdb influxdb
[influxdb-password]
recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
hostname = ${influxdb:ipv6}
port = ${influxdb:http-port}
[influxdb-password-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ influx_bin }} -username ${influxdb:auth-username} -password ${influxdb:auth-password} -socket ${influxdb:unix-socket} -execute "CREATE USER ${influxdb:auth-username} WITH PASSWORD '${influxdb:auth-password}' WITH ALL PRIVILEGES"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-database-ready-promise]
recipe = slapos.cookbook:wrapper
command-line =
bash -c "{{ influx_bin }} \
-username ${influxdb:auth-username} \
-password ${influxdb:auth-password} \
-host [${influxdb:host}] \
-port ${influxdb:http-port} \
-unsafeSsl \
-ssl \
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
# TODO: actually use parameter
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION 720d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
port = 8180
url = https://[${:ipv6}]:${:port}
data-dir = ${directory:grafana-data-dir}
logs-dir = ${directory:grafana-logs-dir}
plugins-dir = ${directory:grafana-plugins-dir}
provisioning-config-dir = ${directory:grafana-provisioning-config-dir}
provisioning-datasources-dir = ${directory:grafana-provisioning-datasources-dir}
provisioning-dashboards-dir = ${directory:grafana-provisioning-dashboards-dir}
admin-user = ${grafana-password:username}
admin-password = ${grafana-password:passwd}
secret-key = ${grafana-secret-key:passwd}
ssl-key-file = ${grafana-certificate:key-file}
ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
<= generate-insecure-self-signed-certificate
[grafana-password]
recipe = slapos.cookbook:generate.password
username = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
[grafana-config-file]
<= config-file
context =
section grafana grafana
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": "telegraf",
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 50000,
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
<= config-file
rendered = ${grafana:provisioning-dashboards-dir}/dashboard.yaml
context =
key dashboards_dir directory:grafana-dashboards-dir
[grafana-listen-promise]
<= check-port-listening-promise
hostname= ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
# telegraf needs influxdb to be already listening before starting
command-line =
bash -c '${influxdb-listen-promise:path} && nice -19 chrt --idle 0 ionice -c3 {{ telegraf_bin }} --config ${telegraf-config-file:output} --config-directory ${:extra-config-dir}'
wrapper-path = ${directory:service}/telegraf
[telegraf-config-file]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
init =
import zc.buildout
import pkg_resources
buildout_options = self.buildout["buildout"]
zc.buildout.easy_install.install(
["toml"],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options["develop-eggs-directory"],
buildout_options["eggs-directory"],
],
)
import collections
import os.path
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb']
# files to create during install step
self._config_files = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
# v TODO remove agent
for application in slapparameter_dict.get("agent", {}).get("applications", []):
partition_mapping = {}
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
if "reference" in partition:
partition_mapping[partition["reference"]] = partition["name"]
partition_directory = os.path.join(application["instance-root"], partition['reference'])
if partition["type"] in ("erp5/mariadb", "mariadb"):
partition.setdefault("username", "root")
partition.setdefault("dbname", "erp5")
dsn = f"{partition['username']}@unix({partition_directory}/var/run/mariadb.sock)/{partition['dbname']}"
inputs["mysql"].append(
{
"name_override": f"{partition['name']}-mysql",
"servers": [dsn],
"gather_innodb_metrics": True,
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
inputs["sql"].append(
{
"name_override": f"{partition['name']}-activities",
"driver": "mysql",
"dsn": dsn,
"query": [
{
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -2 and -10
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -2 and -10
""",
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/balancer":
inputs["tail"].append(
{
"data_format": "grok",
"files": [f"{partition_directory}/var/log/apache-access.log"],
"grok_custom_pattern_files": [],
"grok_custom_patterns": "",
"grok_patterns": [
'%{IPORHOST:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \\[%{HTTPDATE:timestamp}\\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-) %{QS:referrer} %{QS:agent} %{NUMBER:response_time:int}'
],
"grok_timezone": "Local",
"name_override": f"{partition['name']}",
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
urls = application.get("urls", [])
if urls:
inputs["http_response"].append({
"interval": "5m",
"urls": urls,
"tags": {"app": application["name"]},
})
for url in urls:
x509_url = url
parsed_url = urllib.parse.urlparse(url)
if parsed_url.scheme == 'https':
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
inputs["x509_cert"].append({
"sources": [x509_url],
"tags": {"url": url},
"interval": "5h",
"tags": {"app": application["name"]},
})
# TODO some kind of GET request every X minutes ?
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = os.path.join(
self.options['location'],
f"telegraf-input-slapos-{application['name']}.cfg")
self._config_files[telegraf_slapos_input_config_file] = toml.dumps({
"inputs": {
"slapos": [{
"instance_root": application["instance-root"]}]}})
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": f"{application['name']}-processes",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": [f"{application['name']}-processes"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": [ f"{application['name']}-processes"],
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
self._config_files[options['output']] = toml.dumps(config)
install =
import os
os.mkdir(self.options['location'])
for fname, content in self._config_files.items():
with open(fname, 'w') as f:
f.write(content)
[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
"max_entries_limit_per_query": 50001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO check needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-server-listen-promise]
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
# agent + server
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
# req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
# [ req_ext ]
# subjectAltName = @alt_names
# [ alt_names ]
# IP.1 = ${loki-server:ipv4}
# IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
# agent
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count needs:
# server: loki
# clients: loki promise, grafana, promtail
# TODO: this is bad default
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout['bin-directory'],
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=5,
user_auto_approve_count=1,
key_len=2048,
)}}
[promtail]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ promtail_bin }} -config.file=${promtail-config-file:location}'
wrapper-path = ${directory:service}/promtail
dir = ${directory:promtail-dir}
http-port = 19080
grpc-port = 19095
ip = ${instance-parameter:ipv4-random}
url = http://${:ip}:${:http-port}
[promtail-config-file]
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.cfg
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
install =
import os
# XXX make extra eggs available to buildout
import zc.buildout
import pkg_resources
buildout_options = self.buildout['buildout']
zc.buildout.easy_install.install(
['pyyaml'],
dest=None,
working_set=pkg_resources.working_set,
path=[
buildout_options['develop-eggs-directory'],
buildout_options['eggs-directory']])
import yaml
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
"http_listen_port": int(self.buildout['promtail']['http-port']),
"grpc_listen_address": self.buildout['promtail']['ip'],
"grpc_listen_port": int(self.buildout['promtail']['grpc-port']),
"graceful_shutdown_timeout": 5,
"external_url": self.buildout['promtail']['url'],
},
"positions": {
"filename": "{}/positions.yaml".format(self.buildout['promtail']['dir']),
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": self.buildout['loki-server']['ca-file'],
"cert_file": self.buildout['loki-promtail-client-certificate']['cert-file'],
"key_file": self.buildout['loki-promtail-client-certificate']['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
"scrape_configs": []
}
def get_job_selector(partition, job_name, application_name):
# make a selector in LogQL, like '{job="job_name",key="value"}'
selector_parts = [f'app="{application_name}"']
for k, v in dict(partition.get('static-tags', {}), job=job_name).items():
selector_parts.append(f'{k}="{v}"')
return "{%s}" % ",".join(selector_parts)
def get_static_configs(partition, job_name, path, application):
directory = ''
if partition.get('reference') and 'instance-root' in path:
directory = os.path.join(application['instance-root'], partition['reference'])
return [
{
"targets": [
"localhost"
],
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
app=application['name'],
name=partition['name'],
partition_reference=partition['reference'],
computer_id=slap_connection['computer-id'],
__path__=path.format(directory=directory),
)
}
]
# Add grafana and influxdb own logs. TODO: not in agent mode
cfg['scrape_configs'].append(
{
"job_name": "Grafana",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Grafana", "reference": slap_connection['partition-id']},
"Grafana",
f"{self.buildout['directory']['home']}/.*_grafana*.log",
{"name": "Grafana"}
)
}
)
cfg['scrape_configs'].append(
{
"job_name": "Influxdb",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Influxdb", "reference": slap_connection['partition-id']},
"Influxdb",
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
{"name": "Grafana"},
)
}
)
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
if partition['type'] in ('erp5/zope-activity', 'erp5/zope-front'):
job_name = f"{partition['name']}-event-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
# TODO don't include the ----
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-event.log",
application,
)})
if partition['type'] == 'erp5/zope-front':
job_name = f"{partition['name']}-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
# drop requests for haproxy health check
"pipeline_stages": [
{
"drop": {
"expression": '.* "GET / HTTP/1.0" 200 .*'
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-Z2.log",
application,
)})
job_name = f"{partition['name']}-long-request-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^(?P<timestamp>.*) .*"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zope-*-longrequest.log",
application,
)})
if partition['type'] in ('erp5/mariadb', 'mariadb'):
job_name = f"{partition['name']}-mariadb-slow-queries"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happens
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": ".*SET timestamp=(?P<timestamp>\\d+);.*"
}
},
{
"timestamp": {
"format": "Unix",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/mariadb_slowquery.log",
application,
)})
job_name = f"{partition['name']}-mariadb-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"timestamp": {
"format": "2021-06-05 3:55:31",
"source": "timestamp"
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/mariadb_error.log",
application,
)})
if partition['type'] == 'erp5/zeo':
job_name = f"{partition['name']}-zeo-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"pipeline_stages": [
{
"match": {
"selector": get_job_selector(partition, job_name, application['name']),
"stages": [
{
"multiline": {
"firstline": "^------",
"max_wait_time": "3s"
}
},
{
"regex": {
"expression": "^------\\n(?P<timestamp>\\d{4}-\\d{2}-\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}\\,\\d{3}) (?P<level>\\S+) (?P<component>\\S+) (?P<message>.*)"
}
},
{
"timestamp": {
"format": "2021-04-04 03:57:11,242",
"source": "timestamp"
}
},
{
"labels": {
"level": None,
}
}
]
}
}
],
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/zeo-*.log",
application,
)})
if partition['type'] == 'erp5/balancer':
job_name = f"{partition['name']}-balancer-access-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/apache-access.log",
application,
)})
job_name = f"{partition['name']}-balancer-error-log"
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
"{directory}/var/log/apache-error.log",
application,
)})
if partition.get('log-file-patterns'):
job_name = partition['name']
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
f"{partition['log-file-patterns']}",
application,
)})
with open(self.options['location'], 'w') as f:
yaml.dump(cfg, f)
{% endraw %}
[promtail-listen-promise]
<= check-port-listening-promise
hostname= ${promtail:ip}
port = ${promtail:http-port}
[apache-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = Grafana Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
return = domain secure_access
[apache-frontend-available-promise]
<= check-url-available-promise
url = ${apache-frontend:connection-secure_access}
[promises]
recipe =
instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${promtail-listen-promise:path}
${apache-frontend-available-promise:path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
influxdb-password = ${influxdb:auth-password}
telegraf-extra-config-dir = ${telegraf:extra-config-dir}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
promtail-url = ${promtail:url}
url = ${apache-frontend:connection-secure_access}
...@@ -129,15 +129,18 @@ url = ${:_profile_base_location_}/${:filename} ...@@ -129,15 +129,18 @@ url = ${:_profile_base_location_}/${:filename}
[grafana-provisioning-dashboards-config-file] [grafana-provisioning-dashboards-config-file]
<= download-file-base <= download-file-base
[loki-config-file]
<= download-file-base
[instance-eggs] [instance-eggs]
recipe = zc.recipe.egg recipe = zc.recipe.egg
eggs = eggs =
${python-PyYAML:egg}
toml toml
[instance-agent]
<= download-file-base
[instance-default]
<= download-file-base
[instance-profile] [instance-profile]
recipe = slapos.recipe.template:jinja2 recipe = slapos.recipe.template:jinja2
url = ${:_profile_base_location_}/${:filename} url = ${:_profile_base_location_}/${:filename}
...@@ -145,6 +148,8 @@ output = ${buildout:directory}/instance.cfg ...@@ -145,6 +148,8 @@ output = ${buildout:directory}/instance.cfg
extensions = jinja2.ext.do extensions = jinja2.ext.do
context = context =
section buildout buildout section buildout buildout
key instance_default instance-default:target
key instance_agent instance-agent:target
key openssl_bin openssl-output:openssl key openssl_bin openssl-output:openssl
key telegraf_bin gowork:telegraf-bin key telegraf_bin gowork:telegraf-bin
key telegraf_input_slapos_bin gowork:telegraf-input-slapos-bin key telegraf_input_slapos_bin gowork:telegraf-input-slapos-bin
...@@ -157,13 +162,12 @@ context = ...@@ -157,13 +162,12 @@ context =
key curl_bin :curl-bin key curl_bin :curl-bin
key dash_bin :dash-bin key dash_bin :dash-bin
key jq_bin :jq-bin key jq_bin :jq-bin
key caucase_jinja2_library caucase-jinja2-library:target
curl-bin = ${curl:location}/bin/curl curl-bin = ${curl:location}/bin/curl
dash-bin = ${dash:location}/bin/dash dash-bin = ${dash:location}/bin/dash
jq-bin = ${jq:location}/bin/jq jq-bin = ${jq:location}/bin/jq
depends = ${instance-eggs:eggs} ${caucase-eggs:eggs} depends = ${instance-eggs:eggs} ${caucase-eggs:eggs}
import-list =
file caucase caucase-jinja2-library:target
[versions] [versions]
inotifyx = 0.2.2
toml = 0.10.2 toml = 0.10.2
...@@ -53,8 +53,7 @@ class GrafanaTestCase(SlapOSInstanceTestCase): ...@@ -53,8 +53,7 @@ class GrafanaTestCase(SlapOSInstanceTestCase):
Since the instances takes time to start and stop, Since the instances takes time to start and stop,
we increase the number of retries. we increase the number of retries.
""" """
# instance_max_retry = 50 instance_max_retry = 50
instance_max_retry = 30 # TODO
report_max_retry = 30 report_max_retry = 30
...@@ -117,6 +116,8 @@ class TestGrafana(GrafanaTestCase): ...@@ -117,6 +116,8 @@ class TestGrafana(GrafanaTestCase):
if loki_health.get('data'): if loki_health.get('data'):
break break
time.sleep(retry) time.sleep(retry)
else:
self.fail(loki_health)
self.assertEqual(loki_health['status'], "success") self.assertEqual(loki_health['status'], "success")
self.assertIn("app", loki_health['data']) self.assertIn("app", loki_health['data'])
...@@ -132,19 +133,20 @@ class TestGrafana(GrafanaTestCase): ...@@ -132,19 +133,20 @@ class TestGrafana(GrafanaTestCase):
class TestGrafanaEmailEnabled(GrafanaTestCase): class TestGrafanaEmailEnabled(GrafanaTestCase):
__partition_reference__ = 'mail' __partition_reference__ = 'mail'
smtp_verify_ssl = "true" smtp_verify_ssl = True
smtp_skip_verify = "false" smtp_skip_verify = "false"
@classmethod @classmethod
def getInstanceParameterDict(cls): def getInstanceParameterDict(cls):
return json.dumps({"_": { return {"_": json.dumps({
"email": {
"smtp-server": "smtp.example.com:25", "smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username", "smtp-username": "smtp_username",
"smtp-password": "smtp_password", "smtp-password": "smtp_password",
'smtp-verify-ssl': cls.smtp_verify_ssl, 'smtp-verify-ssl': cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com", "email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name", "email-from-name": "Grafana From Name",
}}) }})}
def test_email_enabled(self): def test_email_enabled(self):
config = configparser.ConfigParser() config = configparser.ConfigParser()
...@@ -163,7 +165,7 @@ class TestGrafanaEmailEnabled(GrafanaTestCase): ...@@ -163,7 +165,7 @@ class TestGrafanaEmailEnabled(GrafanaTestCase):
class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled): class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled):
smtp_verify_ssl = "false" smtp_verify_ssl = False
smtp_skip_verify = "true" smtp_skip_verify = "true"
...@@ -212,9 +214,14 @@ class TestTelegraf(GrafanaTestCase): ...@@ -212,9 +214,14 @@ class TestTelegraf(GrafanaTestCase):
"instance-root": cls.slap._instance_root, "instance-root": cls.slap._instance_root,
"partitions": [ "partitions": [
{ {
"name": "test grafana - partition name", "name": "test grafana - default partition",
"type": "default", "type": "default",
"reference": "G0" "reference": "G0", # XXX assumes partitions will be allocated in order
},
{
"name": "test grafana - agent partition",
"type": "default",
"reference": "G1"
}, },
], ],
}, },
...@@ -223,6 +230,10 @@ class TestTelegraf(GrafanaTestCase): ...@@ -223,6 +230,10 @@ class TestTelegraf(GrafanaTestCase):
} }
return {'_': json.dumps(parameter_dict)} return {'_': json.dumps(parameter_dict)}
def setUp(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
def test_telegraf_running(self): def test_telegraf_running(self):
with self.slap.instance_supervisor_rpc as supervisor: with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo() all_process_info = supervisor.getAllProcessInfo()
...@@ -230,9 +241,6 @@ class TestTelegraf(GrafanaTestCase): ...@@ -230,9 +241,6 @@ class TestTelegraf(GrafanaTestCase):
self.assertEqual(process_info['statename'], 'RUNNING') self.assertEqual(process_info['statename'], 'RUNNING')
def test_telegraf_ingest_slapos_metrics(self): def test_telegraf_ingest_slapos_metrics(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
# wait for data to be ingested # wait for data to be ingested
time.sleep(16) time.sleep(16)
...@@ -264,27 +272,27 @@ class TestTelegraf(GrafanaTestCase): ...@@ -264,27 +272,27 @@ class TestTelegraf(GrafanaTestCase):
if resp.ok and resp.json()['results'][0].get('series'): if resp.ok and resp.json()['results'][0].get('series'):
break break
time.sleep(i) time.sleep(i)
else:
self.fail(resp.text)
series = resp.json()['results'][0].get('series') series = resp.json()['results'][0].get('series')
print(series)
breakpoint()
# hashes and "-on-watch" is removed from process_name # hashes and "-on-watch" is removed from process_name
self.asserIn('grafana', [s['tags']['process_name'] for s in series]) self.assertIn('grafana', [s['tags']['process_name'] for s in series])
self.asserIn('telegraf', [s['tags']['process_name'] for s in series]) self.assertIn('telegraf', [s['tags']['process_name'] for s in series])
self.asserIn('loki-service', [s['tags']['process_name'] for s in series]) self.assertIn('loki-service', [s['tags']['process_name'] for s in series])
self.asserIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series]) self.assertIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
tags = [s['tags'] for s in series][0] tags = [s['tags'] for s in series if s['tags']['partition_reference'] == 'G0'][0]
self.assertEqual(tags['name'], 'test grafana - partition name') self.assertEqual(tags['name'], 'test grafana - default partition')
self.assertEqual(tags['computer_id'], self.slap._computer_id) self.assertEqual(tags['computer_id'], self.slap._computer_id)
self.assertEqual(tags['partition_reference'], 'G0')
self.assertEqual( self.assertEqual(
set([s['tags']['partition_reference'] for s in series]), {s['tags']['partition_reference'] for s in series},
{'G0'}, {'G0', 'G1'},
) )
self.fail('TODO')
class TestLoki(GrafanaTestCase): class TestLoki(GrafanaTestCase):
@classmethod @classmethod
...@@ -296,12 +304,10 @@ class TestLoki(GrafanaTestCase): ...@@ -296,12 +304,10 @@ class TestLoki(GrafanaTestCase):
"applications": [ "applications": [
{ {
"name": "TestLoki", "name": "TestLoki",
# "instance-root": "/", # XXX needed ?
"partitions": [ "partitions": [
{ {
# no slapos for system application
"name": "test log file", "name": "test log file",
"log-file-patterns": cls._logfile.name, "log-file-patterns": [cls._logfile.name],
"static-tags": { "static-tags": {
"testtag": "foo", "testtag": "foo",
}, },
...@@ -352,14 +358,17 @@ class TestLoki(GrafanaTestCase): ...@@ -352,14 +358,17 @@ class TestLoki(GrafanaTestCase):
if result := resp.json().get('data', {}).get('result', []): if result := resp.json().get('data', {}).get('result', []):
break break
time.sleep(i) time.sleep(i)
else:
self.fail(resp.text)
self.assertEqual( self.assertEqual(
result[0]['stream'], result[0]['stream'],
{ {
'app': 'TestLoki', 'app': 'TestLoki',
'computer_id': self.slap._computer_id,
'detected_level': 'info', 'detected_level': 'info',
'filename': self._logfile.name, 'filename': self._logfile.name,
'job': 'test log file', 'job': 'TestLoki-test log file',
'partition': 'test log file', 'name': 'test log file',
'service_name': 'TestLoki', 'service_name': 'TestLoki',
'testtag': 'foo', 'testtag': 'foo',
} }
...@@ -404,13 +413,13 @@ class TestListenInPartition(GrafanaTestCase): ...@@ -404,13 +413,13 @@ class TestListenInPartition(GrafanaTestCase):
c.laddr for c in self.process_dict['influxdb'].connections() c.laddr for c in self.process_dict['influxdb'].connections()
if c.status == 'LISTEN' if c.status == 'LISTEN'
]), ]),
[ sorted([
(self._ipv4_address, 8088), (self._ipv4_address, 8088),
(self.computer_partition_ipv6_address, 8086), (self.computer_partition_ipv6_address, 8086),
], ]),
) )
def test_telegraph_listen(self): def test_telegraf_listen(self):
self.assertEqual( self.assertEqual(
[ [
c.laddr for c in self.process_dict['telegraf'].connections() c.laddr for c in self.process_dict['telegraf'].connections()
...@@ -425,10 +434,10 @@ class TestListenInPartition(GrafanaTestCase): ...@@ -425,10 +434,10 @@ class TestListenInPartition(GrafanaTestCase):
c.laddr for c in self.process_dict['loki-service'].connections() c.laddr for c in self.process_dict['loki-service'].connections()
if c.status == 'LISTEN' if c.status == 'LISTEN'
]), ]),
[ sorted([
(self.computer_partition_ipv6_address, 3100),
(self._ipv4_address, 9095), (self._ipv4_address, 9095),
], (self.computer_partition_ipv6_address, 3100),
]),
) )
def test_promtail_listen(self): def test_promtail_listen(self):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment