Commit f39f1597 authored by Jérome Perrin

WIP grafana

parent 862073fb
...@@ -15,32 +15,17 @@
[instance-profile]
filename = instance.cfg.in
md5sum = 39a1ee09ca7a12995703ff2a6a869637
md5sum = e4d5ac3e6ad239d3bf48c2b3172919b5

[influxdb-config-file]
filename = influxdb-config-file.cfg.in
md5sum = a28972ced3e0f4aa776e43a9c44717c0

[telegraf-config-file]
filename = telegraf-config-file.cfg.in
md5sum = 6de1faa34842e1eda095a51edecc2083

[grafana-config-file]
filename = grafana-config-file.cfg.in
md5sum = 83a8445858eab21a12f1769c23424bea

[grafana-provisioning-datasources-config-file]
filename = grafana-provisioning-datasources-config-file.cfg.in
md5sum = 3aa0f1ed752b2a59ea2b5e7c1733daf3

[grafana-provisioning-dashboards-config-file]
filename = grafana-provisioning-dashboards-config-file.cfg.in
md5sum = 5616679a9c5c2757540175ead3f5500a

[loki-config-file]
filename = loki-config-file.cfg.in
md5sum = 19a7f5cb904b3287b0bc7cb3e8a27429

[loki-nginx-config-file]
filename = loki-nginx-config-file.cfg.in
md5sum = b08ce1e4abb34eb79e26133459c27c3a
# https://grafana.com/docs/administration/provisioning/#example-datasource-config-file
apiVersion: 1
datasources:
- name: telegraf
type: influxdb
access: proxy
url: {{ influxdb['url'] }}
user: {{ influxdb['auth-username'] }}
database: telegraf
isDefault: true
jsonData:
tlsSkipVerify: true
secureJsonData:
password: {{ influxdb['auth-password'] }}
version: 1
editable: false
- name: loki
type: loki
access: proxy
url: {{ loki['url'] }}
version: 1
editable: false
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$schema": "http://json-schema.org/draft-04/schema",
"description": "Parameters to instantiate Grafana",
"description": "Parameters to instantiate an agent collecting logs and metrics",
"type": "object",
"additionalProperties": false,
"$defs": {
...@@ -42,32 +42,12 @@
]
}
},
"required": [
"applications",
"influxdb",
"loki"
],
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "boolean"
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"default": "Grafana",
"type": "string"
},
"applications": { "applications": {
"description": "Applications to monitor", "description": "Applications to monitor",
"type": "array", "type": "array",
...@@ -107,6 +87,7 @@ ...@@ -107,6 +87,7 @@
"name", "name",
"reference" "reference"
], ],
"additionalProperties": false,
"properties": { "properties": {
"name": { "name": {
"type": "string", "type": "string",
...@@ -174,6 +155,7 @@
},
{
"type": "object",
"additionalProperties": false,
"description": "Configuration for `system` type application",
"required": [
"type",
...@@ -194,6 +176,7 @@
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"name": {
"type": "string",
...@@ -237,6 +220,57 @@
}
]
}
},
"influxdb": {
"description": "Connection information for influxdb",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"database",
"username",
"password"
],
"properties": {
"url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"format": "uri",
"type": "string"
},
"database": {
"description": "database created in influxdb",
"type": "string"
},
"username": {
"description": "username for influxdb",
"type": "string"
},
"password": {
"description": "password for influxdb user",
"type": "string"
}
}
},
"loki": {
"description": "Connection information for loki",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"caucase-url"
],
"properties": {
"url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"caucase-url": {
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
}
}
}
}
}
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by agent instantiation",
"additionalProperties": false,
"properties": {
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"type": "string"
}
},
"type": "object"
}
{
"$schema": "http://json-schema.org/draft-07/schema",
"description": "Parameters to instantiate Grafana",
"type": "object",
"additionalProperties": false,
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "boolean"
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"default": "Grafana",
"type": "string"
},
"caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates",
"type": "string",
"format": "uri"
},
"influxdb": {
"description": "Fine tuning influxdb parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"default-retention-policy-days": {
"description": "Number of days to keep metrics data",
"default": 720,
"type": "integer"
}
}
},
"loki": {
"description": "Fine tuning loki parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"retention-period-days": {
"description": "Number of days to keep log data",
"default": 60,
"type": "integer"
}
}
},
"agent": {
"type": "object",
"properties": {
"applications": {
"$ref": "./instance-agent-input-schema.json#properties/applications"
}
}
}
}
}
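For orientation, a parameter set that would validate against this schema might look like the sketch below (hypothetical placeholder values; the nested "applications" list follows the agent input schema referenced by the $ref above):

# Hypothetical instance parameters for the Grafana software type.
grafana_parameters = {
    "smtp-server": "smtp.example.com:587",  # placeholder
    "smtp-username": "grafana",
    "smtp-password": "secret",
    "smtp-verify-ssl": True,
    "email-from-address": "grafana@example.com",
    "influxdb": {"default-retention-policy-days": 365},
    "loki": {"retention-period-days": 30},
    "agent": {
        # same shape as "applications" in instance-agent-input-schema.json
        "applications": [],
    },
}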
{
"$schema": "http://json-schema.org/draft-04/schema#",
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by Grafana instantiation",
"additionalProperties": false,
"properties": {
"url": {
"description": "Shared frontend for this Grafana instance",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"grafana-username": {
...@@ -18,12 +18,12 @@
},
"grafana-url": {
"description": "IPv6 URL to access grafana",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-database": {
...@@ -38,8 +38,14 @@
"description": "password for influxdb user",
"type": "string"
},
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration files will be loaded. These files must match the *.conf pattern",
"type": "string"
},
"loki-url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"loki-caucase-url": {
"description": "URL of the caucase service used by Loki",
"format": "uri",
"type": "string"
}
},
...
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
...@@ -30,6 +32,7 @@ recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
...@@ -45,17 +48,19 @@ grafana-dashboards-dir = ${:grafana-dir}/dashboards
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
loki-dir = ${:srv}/loki
loki-boltdb-shipper-active-index-directory = ${:loki-dir}/index
loki-boltdb-shipper-cache-location = ${:loki-dir}/index-cache
loki-compactor-working-directory = ${:loki-dir}/compactor
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-nginx-dir = ${:srv}/loki-nginx
loki-nginx-logs-dir = ${:loki-nginx-dir}/logs
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail

# macros
[generate-certificate]
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
...@@ -104,7 +109,7 @@ database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
nice -19 chrt --idle 0 ionice -c3 {{ influxd_bin }} -config ${influxdb-config-file:output}
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb

[influxdb-config-file]
...@@ -117,7 +122,7 @@ recipe = slapos.cookbook:generate.password
username = influxdb

[influxdb-certificate]
<= generate-certificate
<= generate-insecure-self-signed-certificate

[influxdb-listen-promise]
<= check-port-listening-promise
...@@ -143,6 +148,17 @@ command-line =
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-defaul-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
# TODO: actually use parameter
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION 720d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
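The TODO above notes that the promise still hardcodes DURATION 720d instead of honouring the `influxdb.default-retention-policy-days` input parameter (whose schema default is 720). A minimal sketch of the missing interpolation, assuming the parameter dict is available as in the neighbouring `slapos.recipe.build` parts (names illustrative):

# Hypothetical: build the retention statement from instance parameters.
days = slapparameter_dict.get("influxdb", {}).get(
    "default-retention-policy-days", 720)  # schema default
statement = (
    'CREATE RETENTION POLICY "slapos-default-policy" '
    'ON "{}" DURATION {}d REPLICATION 1 DEFAULT'.format(database, days))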
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
...@@ -163,17 +179,20 @@ ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }} -config ${grafana-config-file:output} -homepath {{ grafana_homepath }}
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}

[grafana-certificate]
<= generate-certificate
<= generate-insecure-self-signed-certificate

[grafana-password]
# TODO
#recipe = slapos.cookbook:generate.password
recipe = slapos.cookbook:generate.password
username = admin
passwd = admin

[grafana-secret-key]
recipe = slapos.cookbook:generate.password
...@@ -185,15 +204,94 @@ context =
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:output}
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}

[grafana-provisioning-datasources-config-file]
<= config-file
output = ${grafana:provisioning-datasources-dir}/datasource.yaml
context =
section influxdb influxdb
section loki loki
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": "telegraf",
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 50000,
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
<= config-file
...@@ -206,6 +304,15 @@ context =
hostname= ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
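This works as a promise because `jq -e` derives its exit status from the last output value: 0 when the expression yields neither false nor null, 1 otherwise, so the wrapper fails until both certificates are present in the generated file. A rough standalone equivalent, assuming the generated path (note the file contains JSON despite its .yaml name):

# Hypothetical stand-in for the jq check above.
import json
import sys

with open("datasources.yaml") as f:  # placeholder path
    config = json.load(f)

loki = config["datasources"][1]["secureJsonData"]
# Exit non-zero while caucase has not yet delivered the certificates.
sys.exit(0 if loki.get("tlsClientCert") and loki.get("tlsCACert") else 1)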
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
...@@ -215,14 +322,8 @@ command-line =
wrapper-path = ${directory:service}/telegraf

[telegraf-config-file]
<= config-file
context =
section influxdb influxdb
section telegraf telegraf
section extra telegraf-config-file-extra

[telegraf-config-file-extra]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
init =
...@@ -245,13 +346,56 @@ init =
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb']
# files to create during install step
self._config_files = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
slapparameter_dict = self.options["slapparameter-dict"]
for application in slapparameter_dict.get("applications", []):
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
# v TODO remove agent
for application in slapparameter_dict.get("agent", {}).get("applications", []):
partition_mapping = {}
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
...@@ -267,7 +411,12 @@ init =
"name_override": f"{partition['name']}-mysql",
"servers": [dsn],
"gather_innodb_metrics": True,
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
...@@ -278,39 +427,44 @@ init =
"dsn": dsn,
"query": [
{
"query": "select count(*) as message_count from message",
"field_columns_include": ["message_count"],
},
{
"query": "select count(*) as message_queue_count from message_queue",
"field_columns_include": ["message_queue_count"],
},
{
"query": "select count(*) as message_failed_count from message where processing_node=-2",
"field_columns_include": ["message_failed_count"],
},
{
"query": "select count(*) as message_queue_failed_count from message_queue where processing_node=-2",
"field_columns_include": ["message_queue_failed_count"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as message_waiting_time from message
where processing_node in (-1, 0) and message not like '%after_tag%'
""",
"field_columns_include": ["message_waiting_time"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as message_queue_waiting_time from message_queue
where processing_node in (-1, 0) and message not like '%after_tag%'
""",
"field_columns_include": ["message_queue_waiting_time"],
}
{
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -2 and -10
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -2 and -10
""",
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
...@@ -326,7 +480,12 @@ init =
],
"grok_timezone": "Local",
"name_override": f"{partition['name']}",
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
urls = application.get("urls", [])
...@@ -344,12 +503,13 @@ init =
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
inputs["x509_cert"].append({
"sources": [x509_url],
"interval": "5h",
# tag with both the probed url and the application name
"tags": {"url": url, "app": application["name"]},
})
# TODO some kind of GET request every X minutes ?
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = os.path.join(
...@@ -360,65 +520,69 @@ init =
"slapos": [{
"instance_root": application["instance-root"]}]}})
# TODO: supervisor process finder for
# https://github.com/influxdata/telegraf/tree/master/plugins/inputs/procstat ?
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": f"{application['name']}-processes",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# "cleanup" slapos process names, remove hash from wrappers and -on-watch suffix
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 1,
"tags": [{
"key": "name",
"pattern": "^(.*)-.{32}",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"tags": [{
"key": "name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["enum"].append({
"namepass": [ f"{application['name']}-processes"],
"mapping": [{
# "tag": "group", # TODO: rename this in input plugin # XXX I don't remember what this means
"tag": "slappart",
"dest": "partition",
"value_mappings": partition_mapping,
}]})
# TODOs:
# - [ ] slapos input
# - [x] friendly name of slappart
# - [x] strip hashes from -on-watch
# - [x] activity metrics
# - [ ] alert dashboard
# - [ ] "jerome-dev" included everywhere ???
# - [ ] apdex
# - [ ] "job" is bad name in Explore
options["extra-config"] = toml.dumps({
"inputs": inputs,
"processors": processors})
# import pdb; pdb.set_trace()
# apdex
# SELECT sum("success") / sum("all") FROM
# (SELECT count("duration") AS "all" FROM "jerome-dev-balancer" WHERE $timeFilter GROUP BY time($__interval) fill(null)),
# (SELECT count("duration") AS "success" FROM "jerome-dev-balancer" WHERE ("resp_code" = '200' ) AND $timeFilter GROUP BY time($__interval) fill(null))
#SELECT sum("success") + sum("all") FROM
# (SELECT count("duration") AS "all" FROM "jerome-dev-balancer" WHERE $timeFilter GROUP BY time($__interval) fill(0)),
# (SELECT count("duration") AS "success" FROM "jerome-dev-balancer" WHERE ("resp_code" = '200' ) AND $timeFilter GROUP BY time($__interval) fill(0))
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": [f"{application['name']}-processes"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 3,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-on-watch$",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 4,
"tags": [{
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": [ f"{application['name']}-processes"],
"order": 6,
"mapping": [{
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
self._config_files[options['output']] = toml.dumps(config)

install =
import os
...@@ -427,180 +591,324 @@ install =
with open(fname, 'w') as f:
f.write(content)
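For reference, the commented apdex queries above only compute the ratio of successful requests; a conventional Apdex score also gives half weight to "tolerating" samples. A sketch of the usual definition, independent of InfluxQL:

# Standard Apdex (reference only): T is the target response time;
# satisfied: duration <= T, tolerating: T < duration <= 4T.
def apdex(satisfied, tolerating, total):
    return (satisfied + tolerating / 2.0) / total if total else 1.0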
[loki]
boltdb-shipper-active-index-directory = ${directory:loki-boltdb-shipper-active-index-directory}
boltdb-shipper-cache-location = ${directory:loki-boltdb-shipper-cache-location}
compactor-working-directory = ${directory:loki-compactor-working-directory}
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
ip = ${instance-parameter:ipv4-random}
read-1-http-port = 3101
read-1-grpc-port = 9096
read-1-memberlist-port = 7947
read-2-http-port = 3102
read-2-grpc-port = 9097
read-2-memberlist-port = 7948
write-http-port = 3103
write-grpc-port = 9098
write-memberlist-port = 7949
query-frontend-http-port = 3104
query-frontend-grpc-port = 9099
query-frontend-memberlist-port = 7950
querier-http-port = 3105
querier-grpc-port = 9100
querier-memberlist-port = 7951
index-gateway-http-port = 3106
index-gateway-grpc-port = 9101
index-gateway-memberlist-port = 7952
query-scheduler-http-port = 3107
query-scheduler-grpc-port = 9102
query-scheduler-memberlist-port = 7953
# compactor
nginx-port = 3100
url = http://${:ip}:${:nginx-port}
ipv6 = ${instance-parameter:ipv6-random}

[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL

[loki-service-macro]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ loki_bin }} \
-config.file=${loki-config-file:output} \
\
-boltdb.shipper.compactor.ring.instance-addr=${loki:ip} \
-boltdb.shipper.compactor.ring.instance-id=${:_buildout_section_name_} \
-common.embedded-cachering.instance-addr=${loki:ip} \
-common.embedded-cachering.instance-id=${:_buildout_section_name_} \
-distributor.ring.instance-addr=${loki:ip} \
-distributor.ring.instance-id=${:_buildout_section_name_} \
-frontend.instance-addr=${loki:ip} \
-frontend.instance-port=${loki:query-frontend-grpc-port} \
-index-gateway.ring.instance-addr=${loki:ip} \
-index-gateway.ring.instance-id=${:_buildout_section_name_} \
-memberlist.advertise-port=${:memberlist-port} \
-memberlist.bind-port=${:memberlist-port} \
-memberlist.nodename=${:_buildout_section_name_} \
-query-scheduler.ring.instance-addr=${loki:ip} \
-query-scheduler.ring.instance-id=${:_buildout_section_name_} \
-ruler.ring.instance-addr=${loki:ip} \
-ruler.ring.instance-id=${:_buildout_section_name_} \
-server.grpc-listen-port=${:grpc-port} \
-server.http-listen-port=${:http-port} \
${:extra-command-line}'
wrapper-path = ${directory:service}/${:_buildout_section_name_}
extra-command-line =

[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
"max_entries_limit_per_query": 50001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO check needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
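Writing the configuration with `json.dump` into a file consumed as YAML is intentional: JSON documents are also valid YAML, so Loki parses the output unchanged. A quick self-check of that assumption (PyYAML assumed, for the check only):

import json
import yaml

cfg = {"limits_config": {"retention_period": "60d"}}
assert yaml.safe_load(json.dumps(cfg)) == cfg  # JSON round-trips through YAML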
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-listen-promise-macro]
<= check-url-available-promise
url = http://${loki:ip}:${:port}/ready

[loki-server-listen-promise]
<= check-url-available-promise
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}

[loki-read-1-service]
<= loki-service-macro
extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-2-grpc-port} -query-scheduler.ring.instance-port=${loki:read-1-grpc-port}
http-port = ${loki:read-1-http-port}
grpc-port = ${loki:read-1-grpc-port}
memberlist-port = ${loki:read-1-memberlist-port}
[loki-read-1-listen-promise]
<= loki-listen-promise-macro
port = ${loki-read-1-service:http-port}
[loki-read-2-service]
<= loki-service-macro
extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-1-grpc-port} -query-scheduler.ring.instance-port=${loki:read-2-grpc-port}
http-port = ${loki:read-2-http-port}
grpc-port = ${loki:read-2-grpc-port}
memberlist-port = ${loki:read-2-memberlist-port}
[loki-read-2-listen-promise]
<= loki-listen-promise-macro
port = ${loki-read-2-service:http-port}
[loki-write-service]
<= loki-service-macro
extra-command-line = -target=write
http-port = ${loki:write-http-port}
grpc-port = ${loki:write-grpc-port}
memberlist-port = ${loki:write-memberlist-port}
[loki-write-listen-promise]
<= loki-listen-promise-macro
port = ${loki-write-service:http-port}
[loki-querier-service]
<= loki-service-macro
extra-command-line = -target=querier -querier.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port} -query-scheduler.ring.instance-port=${loki:querier-grpc-port}
http-port = ${loki:querier-http-port}
grpc-port = ${loki:querier-grpc-port}
memberlist-port = ${loki:querier-memberlist-port}
[loki-querier-listen-promise]
<= loki-listen-promise-macro
port = ${loki-querier-service:http-port}
[loki-index-gateway-service]
<= loki-service-macro
extra-command-line = -target=index-gateway -boltdb.shipper.query-ready-num-days=30
# XXX -boltdb.shipper.query-ready-num-days=30 useful ?
http-port = ${loki:index-gateway-http-port}
grpc-port = ${loki:index-gateway-grpc-port}
memberlist-port = ${loki:index-gateway-memberlist-port}
[loki-index-gateway-listen-promise]
<= loki-listen-promise-macro
port = ${loki-index-gateway-service:http-port}
[loki-query-frontend-service]
<= loki-service-macro
extra-command-line = -target=query-frontend -frontend.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port}
http-port = ${loki:query-frontend-http-port}
grpc-port = ${loki:query-frontend-grpc-port}
memberlist-port = ${loki:query-frontend-memberlist-port}
[loki-query-frontend-listen-promise]
<= loki-listen-promise-macro
port = ${loki-query-frontend-service:http-port}
[loki-query-scheduler-service]
<= loki-service-macro
extra-command-line = -target=query-scheduler
http-port = ${loki:query-scheduler-http-port}
grpc-port = ${loki:query-scheduler-grpc-port}
memberlist-port = ${loki:query-scheduler-memberlist-port}
[loki-query-scheduler-listen-promise]
<= loki-listen-promise-macro
port = ${loki-query-scheduler-service:http-port}
[loki-config-file]
<= config-file
context =
section loki loki
[loki-nginx-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ nginx_bin }} -p ${directory:loki-nginx-dir} -c ${loki-nginx-config-file:output}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
url = http://${loki:ip}:${loki:nginx-port}
[loki-nginx-listen-promise]
<= check-url-available-promise
url = ${loki-nginx-service:url}

[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
# agent + server
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
# req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
# [ req_ext ]
# subjectAltName = @alt_names
# [ alt_names ]
# IP.1 = ${loki-server:ipv4}
# IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
# agent
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count needs:
# server: loki
# clients: loki promise, grafana, promtail
# TODO: this is bad default
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout['bin-directory'],
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=5,
user_auto_approve_count=1,
key_len=2048,
)}}
[loki-nginx-config-file]
<= config-file
context =
section loki loki
[promtail]
recipe = slapos.cookbook:wrapper
...@@ -618,8 +926,9 @@ url = http://${:ip}:${:http-port}
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.cfg
slapparameter-dict = ${slap-configuration:configuration}
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
install =
import os
# XXX make extra eggs available to buildout
import zc.buildout
...@@ -635,6 +944,7 @@ install =
import yaml
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
...@@ -649,7 +959,14 @@ install =
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki']['url']),
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": self.buildout['loki-server']['ca-file'],
"cert_file": self.buildout['loki-promtail-client-certificate']['cert-file'],
"key_file": self.buildout['loki-promtail-client-certificate']['key-file'],
},
# this might not be good for the copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
...@@ -664,7 +981,7 @@ install =
def get_static_configs(partition, job_name, path, application):
directory = ''
if partition.get('reference'):
if partition.get('reference') and 'instance-root' in path:
directory = os.path.join(application['instance-root'], partition['reference'])
return [
{
...@@ -674,13 +991,41 @@ install =
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
partition=partition['name'],
app=application['name'],
name=partition['name'],
partition_reference=partition['reference'],
computer_id=slap_connection['computer-id'],
__path__=path.format(directory=directory),
)
}
]
# Add grafana and influxdb own logs. TODO: not in agent mode
cfg['scrape_configs'].append(
{
"job_name": "Grafana",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Grafana", "reference": slap_connection['partition-id']},
"Grafana",
f"{self.buildout['directory']['home']}/.*_grafana*.log",
{"name": "Grafana"}
)
}
)
cfg['scrape_configs'].append(
{
"job_name": "Influxdb",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Influxdb", "reference": slap_connection['partition-id']},
"Influxdb",
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
{"name": "Grafana"},
)
}
)
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
...@@ -790,9 +1135,13 @@ install =
"stages": [
{
"multiline": {
# TODO
#"firstline": "^# Time: \\d{2}\\d{2}\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}",
"firstline": r"^# Time: \d{2}.*",
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happen
# at the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
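A small self-check of that `firstline` pattern against the two entry shapes quoted in the comment (illustrative sample; Python's re.M is used here to mirror per-line anchoring):

import re

firstline = re.compile(
    r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)", re.M)
sample = (
    "# Time: 231008 16:29:01\n"
    "# User@Host: user[user] @ [10.0.71.207]\n"
    "select 1;\n"
    # same second: the "# Time:" line is not repeated
    "# User@Host: user[user] @ [10.0.71.207]\n"
    "select 2;\n"
)
assert len(firstline.findall(sample)) == 2  # two multiline entries detected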
...@@ -902,14 +1251,14 @@ install =
application,
)})

if partition.get('file-path'):
if partition.get('log-file-patterns'):
job_name = partition['name']
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
f"{partition['file-path']}",
f"{partition['log-file-patterns']}",
application,
)})
...@@ -932,7 +1281,6 @@ name = Grafana Frontend
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
config-https-only = true
return = domain secure_access

[apache-frontend-available-promise]
...@@ -946,21 +1294,16 @@ instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-defaul-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${loki-query-frontend-listen-promise:path}
${loki-query-scheduler-listen-promise:path}
# ${loki-index-gateway-listen-promise:path}
${loki-querier-listen-promise:path}
# ${loki-read-1-listen-promise:path}
# ${loki-read-2-listen-promise:path}
${loki-write-listen-promise:path}
${loki-nginx-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${promtail-listen-promise:path}
${apache-frontend-available-promise:path}

[publish-connection-parameter]
recipe = slapos.cookbook:publish
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
...@@ -969,6 +1312,7 @@ telegraf-extra-config-dir = ${telegraf:extra-config-dir}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki:url}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
promtail-url = ${promtail:url}
url = ${apache-frontend:connection-secure_access}
# inspired from
# https://github.com/grafana/loki/blob/1489c1731277c327e3661da182bfc6c90d4559f4/tools/dev/loki-boltdb-storage-s3/docker-compose.yml
# and other configuration examples with microservices, because the single binary
# mode assumes running on 127.0.0.1, but in slapos we want to bind on the
# partition's addresses
auth_enabled: false
http_prefix:
server:
http_listen_address: {{ loki['ip'] }}
grpc_listen_address: {{ loki['ip'] }}
grpc_server_max_recv_msg_size: 1.048576e+08
grpc_server_max_send_msg_size: 1.048576e+08
# # TODO ?
# wal:
# enabled: true
# dir: /loki/wal
common:
compactor_address: http://{{ loki['ip'] }}:{{ loki['write-http-port'] }}
schema_config:
configs:
- from: 2020-05-15
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
storage_config:
boltdb_shipper:
active_index_directory: {{ loki['boltdb-shipper-active-index-directory'] }}
cache_location: {{ loki['boltdb-shipper-cache-location'] }}
filesystem:
directory: {{ loki['storage-filesystem-directory'] }}
limits_config:
reject_old_samples: false
enforce_metric_name: false
ingestion_rate_mb: 1024
ingestion_burst_size_mb: 1024
ingester:
lifecycler:
address: {{ loki['ip'] }}
ring:
kvstore:
store: memberlist
replication_factor: 1
compactor:
compaction_interval: 1m
retention_enabled: true
working_directory: {{ loki['compactor-working-directory'] }}
frontend:
log_queries_longer_than: 5s
compress_responses: true
max_outstanding_per_tenant: 2048
tail_proxy_url: http://{{ loki['ip'] }}:{{ loki['querier-http-port']}}
frontend_worker:
scheduler_address: {{ loki['ip'] }}:{{ loki['query-scheduler-grpc-port'] }}
#testERP5Type
memberlist:
bind_addr:
- {{ loki['ip'] }}
join_members:
# - {{ loki['ip'] }}:{{ loki['read-1-memberlist-port'] }}
- {{ loki['ip'] }}:{{ loki['querier-memberlist-port'] }}
# - {{ loki['ip'] }}:{{ loki['write-memberlist-port'] }}
query_scheduler:
max_outstanding_requests_per_tenant: 1024
querier:
query_ingesters_within: 2h
daemon off;
events {
worker_connections 1024;
}
error_log /dev/stdout;
http {
default_type application/octet-stream;
access_log /dev/stdout;
sendfile on;
tcp_nopush on;
upstream read {
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
}
upstream write {
server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
}
upstream cluster {
server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
server {{ loki['ip'] }}:{{ loki['querier-http-port'] }};
}
upstream query-frontend {
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
}
server {
listen {{ loki['ip'] }}:{{ loki['nginx-port'] }};
# XXX while debugging
listen [{{ loki['ipv6'] }}]:{{ loki['nginx-port'] }};
location / {
return 200 'OK';
}
location = /ring {
proxy_pass http://cluster$request_uri;
}
location = /memberlist {
proxy_pass http://cluster$request_uri;
}
location = /config {
proxy_pass http://cluster$request_uri;
}
location = /metrics {
proxy_pass http://cluster$request_uri;
}
location = /ready {
proxy_pass http://cluster$request_uri;
}
location = /loki/api/v1/push {
proxy_pass http://write$request_uri;
}
location = /loki/api/v1/tail {
proxy_pass http://read$request_uri;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
location ~ /loki/api/.* {
proxy_pass http://query-frontend$request_uri;
}
}
}
[buildout]
extends =
../../stack/slapos.cfg
../../stack/caucase/buildout.cfg
../../stack/nodejs.cfg
../../component/make/buildout.cfg
../../component/golang/buildout.cfg
../../component/openssl/buildout.cfg
../../component/curl/buildout.cfg
../../component/dash/buildout.cfg
../../component/nginx/buildout.cfg
../../component/jq/buildout.cfg
../../component/systemd/buildout.cfg
../../component/fluent-bit/buildout.cfg
buildout.hash.cfg

parts =
...@@ -15,21 +18,10 @@ parts =
instance-profile
gowork
influxdb-config-file
telegraf-config-file
grafana-config-file
grafana-provisioning-datasources-config-file
grafana-provisioning-dashboards-config-file
loki-config-file
loki-nginx-config-file
fluent-bit
; [nodejs]
; <= nodejs-16.19.0
[gowork]
# XXX speed up development cycle by not rebuilding workspace on every software run
# XXX does not work ?
update-command =
[go_github.com_grafana_grafana]
<= go-git-package
...@@ -41,7 +33,7 @@ revision = v10.1.2-0-g8e428858dd
<= go-git-package
go.importpath = github.com/grafana/loki
repository = https://github.com/grafana/loki
revision = v2.9.1-0-gd9d5ed4a1
revision = v3.1.0-0-g935aee77e

[go_github.com_influxdata_influxdb]
<= go-git-package
...@@ -59,7 +51,7 @@ revision = v1.28.1-0-g3ea9ffbe2
<= go-git-package
go.importpath = github.com/perrinjerome/telegraf-input-slapos
repository = https://github.com/perrinjerome/telegraf-input-slapos
revision = v0.0.1-0-gf8981f3
revision = v0.0.2-0-gd4c5221

[go_github.com_prometheus_prometheus]
<= go-git-package
...@@ -84,15 +76,18 @@ install =
${go_github.com_perrinjerome_slapos_telegraf_input:location}:./...
${go_github.com_prometheus_prometheus:location}:./cmd/...
# disable cgo, to prevent loki/promtail from using go-systemd
environment = environment =
CGO_ENABLED = 0 CGO_ENABLED=1
CGO_CFLAGS=-I${systemd:location}/include
buildflags =
-tags promtail_journal_enabled
cpkgpath =
${systemd:location}
telegraf-bin = ${:bin}/telegraf telegraf-bin = ${:bin}/telegraf
telegraf-input-slapos-bin = ${:bin}/telegraf-input-slapos telegraf-input-slapos-bin = ${:bin}/telegraf-input-slapos
influx-bin = ${:bin}/influx influx-bin = ${:bin}/influx
influxd-bin = ${:bin}/influxd influxd-bin = ${:bin}/influxd
grafana-bin = ${:bin}/grafana-server grafana-bin = ${grafana:binpath}/grafana
grafana-homepath = ${grafana:homepath} grafana-homepath = ${grafana:homepath}
loki-bin = ${:bin}/loki loki-bin = ${:bin}/loki
promtail-bin = ${:bin}/promtail promtail-bin = ${:bin}/promtail
...@@ -105,8 +100,6 @@ command = bash -ce " ...@@ -105,8 +100,6 @@ command = bash -ce "
. ${gowork:env.sh} && \ . ${gowork:env.sh} && \
go install github.com/google/wire/cmd/wire@v0.5.0 && \ go install github.com/google/wire/cmd/wire@v0.5.0 && \
wire gen -tags oss ./pkg/server ./pkg/cmd/grafana-cli/runner && \ wire gen -tags oss ./pkg/server ./pkg/cmd/grafana-cli/runner && \
# Unlike loki, grafana _needs_ CGO_ENABLED, so we override here
export CGO_ENABLED=1 && \
go run build.go setup && \ go run build.go setup && \
go run build.go build && \ go run build.go build && \
export NODE_OPTIONS=--max_old_space_size=8192 && \ export NODE_OPTIONS=--max_old_space_size=8192 && \
...@@ -119,6 +112,8 @@ command = bash -ce " ...@@ -119,6 +112,8 @@ command = bash -ce "
rm -rf ${buildout:directory}/.cache/yarn/ rm -rf ${buildout:directory}/.cache/yarn/
" "
homepath = ${go_github.com_grafana_grafana:location} homepath = ${go_github.com_grafana_grafana:location}
# XXX "linux-amd64" is not portable here
binpath = ${go_github.com_grafana_grafana:location}/bin/linux-amd64
stop-on-error = true stop-on-error = true
[download-file-base] [download-file-base]
...@@ -128,24 +123,15 @@ url = ${:_profile_base_location_}/${:filename} ...@@ -128,24 +123,15 @@ url = ${:_profile_base_location_}/${:filename}
[influxdb-config-file] [influxdb-config-file]
<= download-file-base <= download-file-base
[telegraf-config-file]
<= download-file-base
[grafana-config-file] [grafana-config-file]
<= download-file-base <= download-file-base
[grafana-provisioning-datasources-config-file]
<= download-file-base
[grafana-provisioning-dashboards-config-file] [grafana-provisioning-dashboards-config-file]
<= download-file-base <= download-file-base
[loki-config-file] [loki-config-file]
<= download-file-base <= download-file-base
[loki-nginx-config-file]
<= download-file-base
[instance-eggs] [instance-eggs]
recipe = zc.recipe.egg recipe = zc.recipe.egg
eggs = eggs =
...@@ -167,13 +153,16 @@ context = ...@@ -167,13 +153,16 @@ context =
key grafana_bin gowork:grafana-bin key grafana_bin gowork:grafana-bin
key grafana_homepath gowork:grafana-homepath key grafana_homepath gowork:grafana-homepath
key loki_bin gowork:loki-bin key loki_bin gowork:loki-bin
raw nginx_bin ${nginx:location}/sbin/nginx
key promtail_bin gowork:promtail-bin key promtail_bin gowork:promtail-bin
key curl_bin :curl-bin key curl_bin :curl-bin
key dash_bin :dash-bin key dash_bin :dash-bin
key jq_bin :jq-bin
curl-bin = ${curl:location}/bin/curl curl-bin = ${curl:location}/bin/curl
dash-bin = ${dash:location}/bin/dash dash-bin = ${dash:location}/bin/dash
depends = ${instance-eggs:eggs} jq-bin = ${jq:location}/bin/jq
depends = ${instance-eggs:eggs} ${caucase-eggs:eggs}
import-list =
file caucase caucase-jinja2-library:target
[versions] [versions]
inotifyx = 0.2.2 inotifyx = 0.2.2
......
{ {
"name": "Grafana", "name": "Grafana",
"description": "Grafana, Telegraf and Influxdb", "description": "Grafana, Influxdb, Loki and Telegraf",
"serialisation": "json-in-xml", "serialisation": "json-in-xml",
"software-type": { "software-type": {
"default": { "default": {
"title": "Default", "title": "Default",
"description": "Grafana, Telegraf and Influxdb in same partition", "description": "Grafana, Influxdb and Loki",
"request": "instance-input-schema.json", "request": "instance-default-input-schema.json",
"response": "instance-output-schema.json", "response": "instance-default-output-schema.json",
"index": 0
},
"agent": {
"title": "Agent",
"description": "Telegraf agent sending metrics to Influxdb and Promtail agent sending logs to Loki",
"request": "instance-agent-input-schema.json",
"response": "instance-agent-output-schema.json",
"index": 0 "index": 0
} }
} }
......
# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared plugins.
# Even if a plugin has no configuration, it must be declared here
# to be active. Declaring a plugin means just specifying the name
# as a section with no variables. To deactivate a plugin, comment
# out the name and any variables.
# Use 'telegraf -config telegraf.toml -test' to see what metrics a config
# file would generate.
# One rule that plugins conform to: wherever a connection string
# can be passed, the values '' and 'localhost' are treated specially.
# They indicate that the plugin should use its own builtin configuration to
# connect to the local system.
# NOTE: The configuration has a few required parameters. They are marked
# with 'required'. Be sure to edit those to make this configuration work.
# Tags can also be specified via a normal map, but only one form at a time:
[tags]
# dc = "us-east-1"
# Configuration for telegraf agent
[agent]
# Default data collection interval for all plugins
interval = "10s"
# Rounds collection interval to 'interval'
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
# Default data flushing interval for all outputs. You should not set this below
# interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
# Jitter the flush interval by a random amount. This is primarily to avoid
# large write spikes for users running a large number of telegraf instances.
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
# Run telegraf in debug mode
debug = false
# Override default hostname, if empty use os.Hostname()
hostname = ""
###############################################################################
# OUTPUTS #
###############################################################################
[outputs]
# Configuration for influxdb server to send metrics to
[outputs.influxdb]
# The full HTTP or UDP endpoint URL for your InfluxDB instance
# Multiple urls can be specified for InfluxDB cluster support.
urls = ["{{ influxdb['url'] }}"]
insecure_skip_verify = true # because we are using a self-signed certificate
# The target database for metrics (telegraf will create it if it does not exist)
database = "{{ influxdb['database'] }}" # required
# Precision of writes, valid values are n, u, ms, s, m, and h
# note: using second precision greatly helps InfluxDB compression
precision = "s"
# Connection timeout (for the connection with InfluxDB), formatted as a string.
# If not provided, will default to 0 (no timeout)
# timeout = "5s"
username = "{{ influxdb['auth-username'] }}"
password = "{{ influxdb['auth-password'] }}"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
###############################################################################
# PLUGINS #
###############################################################################
# Read metrics about cpu usage
[cpu]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
drop = ["cpu_time"]
# Read metrics about memory usage
[mem]
# no configuration
[disk]
[io]
[system]
{{ extra['extra-config'] }}
###############################################################################
# To add ad-hoc config, don't edit this file directly, but place your config
# files in {{ telegraf['extra-config-dir'] }}
###############################################################################
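A quick way to check that the output section above actually delivers metrics is to query the telegraf database over InfluxDB's 1.x /query API. A rough sketch; the URL and credentials are placeholders (the real values are published as connection parameters):

import requests

INFLUXDB = "https://127.0.0.1:8086"  # placeholder InfluxDB endpoint

# ask for the most recent total-cpu idle value written by the [cpu] plugin
resp = requests.get(
    f"{INFLUXDB}/query",
    params={
        "db": "telegraf",  # the database configured above
        "q": "SELECT LAST(usage_idle) FROM cpu WHERE \"cpu\" = 'cpu-total'",
        "u": "influxdb-username",  # placeholder credentials
        "p": "influxdb-password",
    },
    verify=False,  # the instance uses a self-signed certificate
)
resp.raise_for_status()
print(resp.json()["results"][0].get("series", []))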
...@@ -25,14 +25,16 @@ ...@@ -25,14 +25,16 @@
# #
############################################################################## ##############################################################################
from __future__ import unicode_literals import functools
import io import io
import json
import logging import logging
import os import os
import pathlib
import re
import tempfile import tempfile
import textwrap
import time import time
import json import urllib.parse
import psutil import psutil
import requests import requests
...@@ -40,7 +42,6 @@ from six.moves import configparser ...@@ -40,7 +42,6 @@ from six.moves import configparser
from slapos.testing.testcase import makeModuleSetUpAndTestCaseClass from slapos.testing.testcase import makeModuleSetUpAndTestCaseClass
setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass( setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass(
os.path.abspath( os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', 'software.cfg'))) os.path.join(os.path.dirname(__file__), '..', 'software.cfg')))
...@@ -52,50 +53,72 @@ class GrafanaTestCase(SlapOSInstanceTestCase): ...@@ -52,50 +53,72 @@ class GrafanaTestCase(SlapOSInstanceTestCase):
Since the instances take time to start and stop, Since the instances take time to start and stop,
we increase the number of retries. we increase the number of retries.
""" """
instance_max_retry = 50 # instance_max_retry = 50
instance_max_retry = 30 # TODO
report_max_retry = 30 report_max_retry = 30
class TestGrafana(GrafanaTestCase): class TestGrafana(GrafanaTestCase):
def setUp(self): def setUp(self):
self.grafana_url = self.computer_partition.getConnectionParameterDict( self.connection_params = json.loads(
)['grafana-url'] self.computer_partition.getConnectionParameterDict()['_']
)
self.grafana_url = self.connection_params['grafana-url']
def test_grafana_available(self): def test_grafana_available(self):
resp = requests.get(self.grafana_url, verify=False) resp = requests.get(self.grafana_url, verify=False)
self.assertEqual(requests.codes.ok, resp.status_code) self.assertEqual(resp.status_code, requests.codes.ok)
def test_grafana_api(self): def test_grafana_api(self):
# check API is usable # check API is usable
api_org_url = '{self.grafana_url}/api/org'.format(**locals()) api_org_url = f'{self.grafana_url}/api/org'
resp = requests.get(api_org_url, verify=False) resp = requests.get(api_org_url, verify=False)
self.assertEqual(requests.codes.unauthorized, resp.status_code) self.assertEqual(resp.status_code, requests.codes.unauthorized)
connection_params = self.computer_partition.getConnectionParameterDict()
resp = requests.get( resp = requests.get(
api_org_url, api_org_url,
verify=False, verify=False,
auth=requests.auth.HTTPBasicAuth( auth=requests.auth.HTTPBasicAuth(
connection_params['grafana-username'], self.connection_params['grafana-username'],
connection_params['grafana-password'], self.connection_params['grafana-password'],
)) ))
self.assertEqual(requests.codes.ok, resp.status_code) self.assertEqual(resp.status_code, requests.codes.ok)
self.assertEqual(1, resp.json()['id']) self.assertEqual(resp.json()['id'], 1)
def test_grafana_datasource_provisioned(self): def test_grafana_datasource_provisioned(self):
# data sources are provisioned # data sources are provisioned
connection_params = self.computer_partition.getConnectionParameterDict() get = functools.partial(
resp = requests.get( requests.get,
'{self.grafana_url}/api/datasources'.format(**locals()), verify=False,
verify=False, auth=requests.auth.HTTPBasicAuth(
auth=requests.auth.HTTPBasicAuth( self.connection_params['grafana-username'],
connection_params['grafana-username'], self.connection_params['grafana-password'],
connection_params['grafana-password'], )
)) )
self.assertEqual(requests.codes.ok, resp.status_code) datasources_resp = get(f'{self.grafana_url}/api/datasources')
self.assertEqual(datasources_resp.status_code, requests.codes.ok)
self.assertEqual( self.assertEqual(
sorted(['influxdb', 'loki']), sorted([ds['type'] for ds in datasources_resp.json()]),
sorted([ds['type'] for ds in resp.json()])) sorted(['influxdb', 'loki']))
# data sources are usable
# for this we need to wait a bit, because they are only usable once
# some data has been ingested
influxdb, = [ds for ds in datasources_resp.json() if ds['type'] == 'influxdb']
loki, = [ds for ds in datasources_resp.json() if ds['type'] == 'loki']
for retry in range(16):
influxdb_health = get(f'{self.grafana_url}/api/datasources/uid/{influxdb["uid"]}/health').json()
if influxdb_health.get('status') == "OK":
break
time.sleep(retry)
self.assertEqual(influxdb_health['status'], "OK")
for retry in range(16):
loki_health = get(f'{self.grafana_url}/api/datasources/uid/{loki["uid"]}/resources/labels?start={time.time() - 1000}').json()
if loki_health.get('data'):
break
time.sleep(retry)
self.assertEqual(loki_health['status'], "success")
self.assertIn("app", loki_health['data'])
def test_email_disabled(self): def test_email_disabled(self):
config = configparser.ConfigParser() config = configparser.ConfigParser()
...@@ -114,14 +137,14 @@ class TestGrafanaEmailEnabled(GrafanaTestCase): ...@@ -114,14 +137,14 @@ class TestGrafanaEmailEnabled(GrafanaTestCase):
@classmethod @classmethod
def getInstanceParameterDict(cls): def getInstanceParameterDict(cls):
return { return {'_': json.dumps({
"smtp-server": "smtp.example.com:25", "smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username", "smtp-username": "smtp_username",
"smtp-password": "smtp_password", "smtp-password": "smtp_password",
'smtp-verify-ssl': cls.smtp_verify_ssl, 'smtp-verify-ssl': cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com", "email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name", "email-from-name": "Grafana From Name",
} })}
def test_email_enabled(self): def test_email_enabled(self):
config = configparser.ConfigParser() config = configparser.ConfigParser()
...@@ -146,194 +169,158 @@ class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled): ...@@ -146,194 +169,158 @@ class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled):
class TestInfluxDb(GrafanaTestCase): class TestInfluxDb(GrafanaTestCase):
def setUp(self): def setUp(self):
self.influxdb_url = self.computer_partition.getConnectionParameterDict( self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
)['influxdb-url'] self.influxdb_url = self.connection_params['influxdb-url']
def test_influxdb_available(self): def test_influxdb_available(self):
ping_url = '{self.influxdb_url}/ping'.format(**locals()) ping_url = f'{self.influxdb_url}/ping'
resp = requests.get(ping_url, verify=False) resp = requests.get(ping_url, verify=False)
self.assertEqual(requests.codes.no_content, resp.status_code) self.assertEqual(resp.status_code, requests.codes.no_content)
def test_influxdb_api(self): def test_influxdb_api(self):
query_url = '{self.influxdb_url}/query'.format(**locals()) query_url = f'{self.influxdb_url}/query'
connection_params = self.computer_partition.getConnectionParameterDict()
for i in range(10): for i in range(16):
# retry, as it may take a little delay to create databases # retry, as it may take a little delay to create databases
resp = requests.get( resp = requests.get(
query_url, query_url,
verify=False, verify=False,
params=dict( params=dict(
q='SHOW DATABASES', q='SHOW DATABASES',
u=connection_params['influxdb-username'], u=self.connection_params['influxdb-username'],
p=connection_params['influxdb-password'])) p=self.connection_params['influxdb-password']))
self.assertEqual(requests.codes.ok, resp.status_code) self.assertEqual(resp.status_code, requests.codes.ok)
result, = resp.json()['results'] result, = resp.json()['results']
if result['series'] and 'values' in result['series'][0]: if result['series'] and 'values' in result['series'][0]:
break break
time.sleep(0.5 * i) time.sleep(0.5 * i)
self.assertIn( self.assertIn(
[connection_params['influxdb-database']], result['series'][0]['values']) [self.connection_params['influxdb-database']], result['series'][0]['values'])
class TestTelegraf(GrafanaTestCase): class TestTelegraf(GrafanaTestCase):
__partition_reference__ = 'G'
@classmethod
def getInstanceParameterDict(cls):
parameter_dict = {
"agent": {
"applications": [
{
"name": "slapos-standalone-from-test",
"type": "SlapOS",
"instance-root": cls.slap._instance_root,
"partitions": [
{
"name": "test grafana - partition name",
"type": "default",
"reference": "G0"
},
],
},
],
},
}
return {'_': json.dumps(parameter_dict)}
def test_telegraf_running(self): def test_telegraf_running(self):
with self.slap.instance_supervisor_rpc as supervisor: with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo() all_process_info = supervisor.getAllProcessInfo()
process_info, = [p for p in all_process_info if 'telegraf' in p['name']] process_info, = [p for p in all_process_info if 'telegraf' in p['name']]
self.assertEqual('RUNNING', process_info['statename']) self.assertEqual(process_info['statename'], 'RUNNING')
def test_telegraf_ingest_slapos_metrics(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
# wait for data to be ingested
time.sleep(16)
query_url = f'{self.influxdb_url}/query'
query = """
SELECT max("state")
FROM "slapos-standalone-from-test-processes"
WHERE time >= now() - 5m and time <= now()
GROUP BY time(5m),
"partition_reference"::tag,
"name"::tag,
"computer_id"::tag,
"process_name"::tag
fill(null)
"""
get = functools.partial(
requests.get,
verify=False,
params=dict(
q=query,
db=self.connection_params['influxdb-database'],
u=self.connection_params['influxdb-username'],
p=self.connection_params['influxdb-password'],
),
)
for i in range(16):
resp = get(query_url)
if resp.ok and resp.json()['results'][0].get('series'):
break
time.sleep(i)
series = resp.json()['results'][0].get('series')
# hashes and "-on-watch" are removed from process_name
self.assertIn('grafana', [s['tags']['process_name'] for s in series])
self.assertIn('telegraf', [s['tags']['process_name'] for s in series])
self.assertIn('loki-service', [s['tags']['process_name'] for s in series])
self.assertIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
tags = [s['tags'] for s in series][0]
self.assertEqual(tags['name'], 'test grafana - partition name')
self.assertEqual(tags['computer_id'], self.slap._computer_id)
self.assertEqual(
set([s['tags']['partition_reference'] for s in series]),
{'G0'},
)
class TestLoki(GrafanaTestCase): class TestLoki(GrafanaTestCase):
instance_max_retry = 2
@classmethod @classmethod
def getInstanceParameterDict(cls): def getInstanceParameterDict(cls):
cls._logfile = tempfile.NamedTemporaryFile(suffix='log') cls._logfile = tempfile.NamedTemporaryFile(suffix='log')
cls.addClassCleanup(cls._logfile.close)
parameter_dict = { parameter_dict = {
"agent": {
"applications": [ "applications": [
{ {
"name": "System", "name": "TestLoki",
"instance-root": "/", # "instance-root": "/", # XXX needed ?
"partitions": [ "partitions": [
{ {
# no slapos for system application # no slapos for system application
# XXX example "name": "test log file",
"name": "syslog", "log-file-patterns": cls._logfile.name,
"reference": "syslog", "static-tags": {
"files": [ "testtag": "foo",
"/srv/slapgrid/slappart15/grosgzip/bench.log", },
] },
}, ],
] },
}, ],
{ },
"name": "ERP5",
"instance-root": "/srv/slapgrid/slappart15/srv/runner/instance/",
"urls": [
# TODO
# "https://XXX.host.vifib.net/erp5/",
],
"partitions": [
{
"name": "jerome-dev-mariadb",
"reference": "slappart3",
"type": "erp5/mariadb",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-zodb",
"reference": "slappart4",
"type": "erp5/zeo",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-balancer",
"reference": "slappart6",
"type": "erp5/balancer",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-zope-front",
"reference": "slappart5",
"type": "erp5/zope-front",
#"static-tags": {
# "XXX": "needed?"
#}
},
# {
# "name": "jerome-dev-zope-front",
# "reference": "slappart13",
# "type": "erp5/zope-activity",
# #"static-tags": {
# # "XXX": "needed?"
# #}
# }
]
}
],
# TODO: drop this
'promtail-extra-scrape-config':
textwrap.dedent(r'''
- job_name: {cls.__name__}
pipeline_stages:
- match:
selector: '{{job="{cls.__name__}"}}'
stages:
- multiline:
firstline: '^\d{{4}}-\d{{2}}-\d{{2}}\s\d{{1,2}}\:\d{{2}}\:\d{{2}}\,\d{{3}}'
max_wait_time: 3s
- regex:
expression: '^(?P<timestamp>.*) - (?P<name>\S+) - (?P<level>\S+) - (?P<message>.*)'
- timestamp:
format: 2006-01-02T15:04:05Z00:00
source: timestamp
- labels:
level:
name:
static_configs:
- targets:
- localhost
labels:
job: {cls.__name__}
__path__: {cls._logfile.name}
''').format(**locals())
} }
return {'_': json.dumps(parameter_dict)} return {'_': json.dumps(parameter_dict)}
def xgetInstanceParameterDict(cls):
cls._logfile = tempfile.NamedTemporaryFile(suffix='log')
return {
'promtail-extra-scrape-config':
textwrap.dedent(
r'''
- job_name: {cls.__name__}
pipeline_stages:
- match:
selector: '{{job="{cls.__name__}"}}'
stages:
- multiline:
firstline: '^\d{{4}}-\d{{2}}-\d{{2}}\s\d{{1,2}}\:\d{{2}}\:\d{{2}}\,\d{{3}}'
max_wait_time: 3s
- regex:
expression: '^(?P<timestamp>.*) - (?P<name>\S+) - (?P<level>\S+) - (?P<message>.*)'
- timestamp:
format: 2006-01-02T15:04:05Z00:00
source: timestamp
- labels:
level:
name:
static_configs:
- targets:
- localhost
labels:
job: {cls.__name__}
__path__: {cls._logfile.name}
''').format(**locals())
}
@classmethod
def tearDownClass(cls):
cls._logfile.close()
super(TestLoki, cls).tearDownClass()
def setUp(self): def setUp(self):
self.loki_url = self.computer_partition.getConnectionParameterDict( self.loki_url = json.loads(
self.computer_partition.getConnectionParameterDict()['_']
)['loki-url'] )['loki-url']
def test_loki_available(self): def test_loki_certificate_required(self):
import pdb;pdb; set_trace() with self.assertRaisesRegex(requests.exceptions.SSLError, 'certificate required'):
self.assertEqual( requests.get(f'{self.loki_url}/ready', verify=False)
requests.codes.ok,
requests.get(f'{self.loki_url}/ready',
verify=False).status_code)
def test_log_ingested(self): def test_log_ingested(self):
# create a logger logging to the file that we have # create a logger logging to the file that we have
...@@ -342,68 +329,45 @@ class TestLoki(GrafanaTestCase): ...@@ -342,68 +329,45 @@ class TestLoki(GrafanaTestCase):
test_logger.propagate = False test_logger.propagate = False
test_logger.setLevel(logging.INFO) test_logger.setLevel(logging.INFO)
test_handler = logging.FileHandler(filename=self._logfile.name) test_handler = logging.FileHandler(filename=self._logfile.name)
test_handler.setFormatter(
logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
test_logger.addHandler(test_handler) test_logger.addHandler(test_handler)
test_logger.info("testing message") test_logger.info("testing info message")
test_logger.info("testing another message") partition_root = pathlib.Path(self.computer_partition_root_path)
test_logger.warning("testing warn") get = functools.partial(
# log an exception, which will be multi line in log file. requests.get,
def nested1(): cert=(
def nested2(): partition_root / 'etc' / 'loki-promise-client-certificate.crt',
raise ValueError('boom') partition_root / 'etc' / 'loki-promise-client-certificate.key',
nested2() ),
try: verify=partition_root / 'etc' / 'loki-server-certificate.ca.crt',
nested1() )
except ValueError: url = urllib.parse.urlparse(
test_logger.exception("testing exception") self.loki_url
)._replace(
# Check our messages have been ingested path="/loki/api/v1/query_range",
# we retry a few times, because there's a short delay until messages are query=urllib.parse.urlencode({'query': '{app="TestLoki"} |= ""'}),
# ingested and returned. ).geturl()
for i in range(60): for i in range(16):
resp = requests.get( resp = get(url)
'{self.loki_url}/api/prom/query?query={{job="TestLoki"}}'.format( if resp.ok:
**locals()), if result := resp.json().get('data', {}).get('result', []):
verify=False).json() break
if len(resp.get('streams', [])) < 3: time.sleep(i)
time.sleep(0.5 * i) self.assertEqual(
continue result[0]['stream'],
{
warn_stream_list = [stream for stream in resp['streams'] if 'level="WARNING"' in stream['labels']] 'app': 'TestLoki',
self.assertEqual(1, len(warn_stream_list), resp['streams']) 'detected_level': 'info',
warn_stream, = warn_stream_list 'filename': self._logfile.name,
self.assertIn("testing warn", warn_stream['entries'][0]['line']) 'job': 'test log file',
'partition': 'test log file',
info_stream_list = [stream for stream in resp['streams'] if 'level="INFO"' in stream['labels']] 'service_name': 'TestLoki',
self.assertEqual(1, len(info_stream_list), resp['streams']) 'testtag': 'foo',
info_stream, = info_stream_list }
self.assertTrue( )
[ self.assertEqual(
line for line in info_stream['entries'] [v[1] for v in result[0]['values']],
if "testing message" in line['line'] ['testing info message'])
]) self.assertEqual(len(result), 1)
self.assertTrue(
[
line for line in info_stream['entries']
if "testing another message" in line['line']
])
error_stream_list = [stream for stream in resp['streams'] if 'level="ERROR"' in stream['labels']]
self.assertEqual(1, len(error_stream_list), resp['streams'])
error_stream, = error_stream_list
line, = [line['line'] for line in error_stream['entries']]
# this entry is multi-line
self.assertIn('testing exception\nTraceback (most recent call last):\n', line)
self.assertIn('ValueError: boom', line)
# The labels we have configured are also available
resp = requests.get(
'{self.loki_url}/api/prom/label'.format(**locals()),
verify=False).json()
self.assertIn('level', resp['values'])
self.assertIn('name', resp['values'])
class TestListenInPartition(GrafanaTestCase): class TestListenInPartition(GrafanaTestCase):
...@@ -411,9 +375,18 @@ class TestListenInPartition(GrafanaTestCase): ...@@ -411,9 +375,18 @@ class TestListenInPartition(GrafanaTestCase):
with self.slap.instance_supervisor_rpc as supervisor: with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo() all_process_info = supervisor.getAllProcessInfo()
def canonical_process_name(process):
"""remove hash from hash-files and "on-watch"
"""
return re.sub(
r'-([a-f0-9]{32})$',
'',
process['name'].replace('-on-watch', ''),
)
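# e.g. (hypothetical supervisor names):
#   'grafana-on-watch' -> 'grafana'
#   'promtail-0123456789abcdef0123456789abcdef-on-watch' -> 'promtail'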
self.process_dict = { self.process_dict = {
p['name'].replace('-on-watch', ''): psutil.Process(p['pid']) canonical_process_name(p): psutil.Process(p['pid'])
for p in all_process_info if p['name'] != 'watchdog' for p in all_process_info if p['name'] != 'watchdog'
} }
def test_grafana_listen(self): def test_grafana_listen(self):
...@@ -449,11 +422,11 @@ class TestListenInPartition(GrafanaTestCase): ...@@ -449,11 +422,11 @@ class TestListenInPartition(GrafanaTestCase):
def test_loki_listen(self): def test_loki_listen(self):
self.assertEqual( self.assertEqual(
sorted([ sorted([
c.laddr for c in self.process_dict['loki'].connections() c.laddr for c in self.process_dict['loki-service'].connections()
if c.status == 'LISTEN' if c.status == 'LISTEN'
]), ]),
[ [
(self._ipv4_address, 3100), (self.computer_partition_ipv6_address, 3100),
(self._ipv4_address, 9095), (self._ipv4_address, 9095),
], ],
) )
......
...@@ -3,12 +3,16 @@ import argparse ...@@ -3,12 +3,16 @@ import argparse
import json import json
import os.path import os.path
import urllib import urllib
from urlparse import urlparse, urlunparse, ParseResult from urllib.parse import urlparse, urlunparse, ParseResult
import jsonschema import jsonschema
# Adapted from slapos.core.git/slapos/slap/util.py # Adapted from slapos.core.git/slapos/slap/util.py
from lxml import etree
def xml2dict(infile): def xml2dict(infile):
import json
d = json.load(infile)
d.pop('$schema', None)
return d
from lxml import etree
result_dict = {} result_dict = {}
for element in etree.parse(infile).iter(tag=etree.Element): for element in etree.parse(infile).iter(tag=etree.Element):
if element.tag == 'parameter': if element.tag == 'parameter':
......