Commit f39f1597 authored by Jérome Perrin

WIP grafana

parent 862073fb
@@ -15,32 +15,17 @@
[instance-profile]
filename = instance.cfg.in
md5sum = 39a1ee09ca7a12995703ff2a6a869637
md5sum = e4d5ac3e6ad239d3bf48c2b3172919b5
[influxdb-config-file]
filename = influxdb-config-file.cfg.in
md5sum = a28972ced3e0f4aa776e43a9c44717c0
[telegraf-config-file]
filename = telegraf-config-file.cfg.in
md5sum = 6de1faa34842e1eda095a51edecc2083
[grafana-config-file]
filename = grafana-config-file.cfg.in
md5sum = 83a8445858eab21a12f1769c23424bea
[grafana-provisioning-datasources-config-file]
filename = grafana-provisioning-datasources-config-file.cfg.in
md5sum = 3aa0f1ed752b2a59ea2b5e7c1733daf3
[grafana-provisioning-dashboards-config-file]
filename = grafana-provisioning-dashboards-config-file.cfg.in
md5sum = 5616679a9c5c2757540175ead3f5500a
[loki-config-file]
filename = loki-config-file.cfg.in
md5sum = 19a7f5cb904b3287b0bc7cb3e8a27429
[loki-nginx-config-file]
filename = loki-nginx-config-file.cfg.in
md5sum = b08ce1e4abb34eb79e26133459c27c3a
# https://grafana.com/docs/administration/provisioning/#example-datasource-config-file
apiVersion: 1
datasources:
- name: telegraf
type: influxdb
access: proxy
url: {{ influxdb['url'] }}
user: {{ influxdb['auth-username'] }}
database: telegraf
isDefault: true
jsonData:
tlsSkipVerify: true
secureJsonData:
password: {{ influxdb['auth-password'] }}
version: 1
editable: false
- name: loki
type: loki
access: proxy
url: {{ loki['url'] }}
version: 1
editable: false
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "Parameters to instantiate Grafana",
"$schema": "http://json-schema.org/draft-04/schema",
"description": "Parameters to instantiate an agent collecting logs and metrics",
"type": "object",
"additionalProperties": false,
"$defs": {
@@ -42,32 +42,12 @@
]
}
},
"required": [
"applications",
"influxdb",
"loki"
],
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "boolean"
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"default": "Grafana",
"type": "string"
},
"applications": {
"description": "Applications to monitor",
"type": "array",
@@ -107,6 +87,7 @@
"name",
"reference"
],
"additionalProperties": false,
"properties": {
"name": {
"type": "string",
@@ -174,6 +155,7 @@
},
{
"type": "object",
"additionalProperties": false,
"description": "Configuration for `system` type application",
"required": [
"type",
@@ -194,6 +176,7 @@
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"properties": {
"name": {
"type": "string",
@@ -237,6 +220,57 @@
}
]
}
},
"influxdb": {
"description": "Connection information for influxdb",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"database",
"username",
"password"
],
"properties": {
"url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"format": "uri",
"type": "string"
},
"database": {
"description": "database created in influxdb",
"type": "string"
},
"username": {
"description": "username for influxdb",
"type": "string"
},
"password": {
"description": "password for influxdb user",
"type": "string"
}
}
},
"loki": {
"description": "Connection information for loki",
"type": "object",
"additionalProperties": false,
"required": [
"url",
"caucase-url"
],
"properties": {
"url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"caucase-url": {
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
}
}
}
}
}
{
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by agent instantiation",
"additionalProperties": false,
"properties": {
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"type": "string"
}
},
"type": "object"
}
{
"$schema": "http://json-schema.org/draft-07/schema",
"description": "Parameters to instantiate Grafana",
"type": "object",
"additionalProperties": false,
"properties": {
"smtp-server": {
"description": "SMTP server used by Grafana to send emails (in host:port format). Leaving this empty will disable email sending.",
"type": "string"
},
"smtp-username": {
"description": "Username to connect to SMTP server",
"type": "string"
},
"smtp-password": {
"description": "Password to connect to SMTP server",
"type": "string"
},
"smtp-verify-ssl": {
"description": "Verify SSL certificate of SMTP server",
"type": "boolean"
},
"email-from-address": {
"description": "Email address used in From: header of emails",
"type": "string"
},
"email-from-name": {
"description": "Name used in From: header of emails",
"default": "Grafana",
"type": "string"
},
"caucase-url": {
"description": "URL of a caucase instance to manage all server and clients certificates",
"type": "string",
"format": "uri"
},
"influxdb": {
"description": "Fine tuning influxdb parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"default-retention-policy-days": {
"description": "Number of days to keep metrics data",
"default": 720,
"type": "integer"
}
}
},
"loki": {
"description": "Fine tuning loki parameters",
"type": "object",
"additionalProperties": false,
"properties": {
"retention-period-days": {
"description": "Number of days to keep log data",
"default": 60,
"type": "integer"
}
}
},
"agent": {
"type": "object",
"properties": {
"applications": {
"$ref": "./instance-agent-input-schema.json#properties/applications"
}
}
}
}
}
{
"$schema": "http://json-schema.org/draft-04/schema#",
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Values returned by Grafana instantiation",
"additionalProperties": false,
"properties": {
"url": {
"description": "Shared frontend for this Grafana instance",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"grafana-username": {
@@ -18,12 +18,12 @@
},
"grafana-url": {
"description": "IPv6 URL to access grafana",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-url": {
"description": "IPv6 URL of influxdb HTTP endpoint",
"pattern": "^https://",
"format": "uri",
"type": "string"
},
"influxdb-database": {
@@ -38,8 +38,14 @@
"description": "password for influxdb user",
"type": "string"
},
"telegraf-extra-config-dir": {
"description": "Directory in telegraf partition where extra configuration file will be loaded. These files must match *.conf pattern",
"loki-url": {
"description": "Base URL of Loki",
"format": "uri",
"type": "string"
},
"loki-caucase-url": {
"description": "URL caucase service used by Loki",
"format": "uri",
"type": "string"
}
},
......
{% import "caucase" as caucase with context %}
[buildout]
parts =
promises
@@ -30,6 +32,7 @@ recipe = slapos.cookbook:mkdirectory
home = ${buildout:directory}
etc = ${:home}/etc
var = ${:home}/var
tmp = ${:home}/tmp
srv = ${:home}/srv
service = ${:etc}/service
promise = ${:etc}/promise
@@ -45,17 +48,19 @@ grafana-dashboards-dir = ${:grafana-dir}/dashboards
telegraf-dir = ${:srv}/telegraf
telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
loki-dir = ${:srv}/loki
loki-boltdb-shipper-active-index-directory = ${:loki-dir}/index
loki-boltdb-shipper-cache-location = ${:loki-dir}/index-cache
loki-compactor-working-directory = ${:loki-dir}/compactor
loki-storage-filesystem-directory = ${:loki-dir}/chunks
loki-nginx-dir = ${:srv}/loki-nginx
loki-nginx-logs-dir = ${:loki-nginx-dir}/logs
loki-compactor-working-directory = ${:loki-dir}/compactor
srv-caucased-loki = ${:srv}/caucased/loki
backup-caucased-loki = ${:srv}/backup/caucased/loki
caucase-updater-loki-server = ${:srv}/caucase-updater/loki-server
caucase-updater-loki-promise-client = ${:srv}/caucase-updater/loki-client-promise
caucase-updater-loki-grafana-client = ${:srv}/caucase-updater/loki-client-grafana
caucase-updater-loki-promtail-client = ${:srv}/caucase-updater/loki-client-promtail
promtail-dir = ${:srv}/promtail
# macros
[generate-certificate]
[generate-insecure-self-signed-certificate]
# TODO: stop using this, use caucase
recipe = plone.recipe.command
command =
if [ ! -e ${:key-file} ]
@@ -104,7 +109,7 @@ database = telegraf
recipe = slapos.cookbook:wrapper
command-line =
nice -19 chrt --idle 0 ionice -c3 {{ influxd_bin }} -config ${influxdb-config-file:output}
{{ influxd_bin }} -config ${influxdb-config-file:output}
wrapper-path = ${directory:service}/influxdb
[influxdb-config-file]
@@ -117,7 +122,7 @@ recipe = slapos.cookbook:generate.password
username = influxdb
[influxdb-certificate]
<= generate-certificate
<= generate-insecure-self-signed-certificate
[influxdb-listen-promise]
<= check-port-listening-promise
@@ -143,6 +148,17 @@ command-line =
-execute 'show databases' | grep '${influxdb:database}'"
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
[influxdb-create-default-data-retention-policy-promise]
recipe = slapos.cookbook:wrapper
# TODO: actually use parameter
command-line =
{{ influx_bin }}
-username ${influxdb:auth-username}
-password ${influxdb:auth-password}
-socket ${influxdb:unix-socket}
-execute 'CREATE RETENTION POLICY "slapos-default-policy" ON "${influxdb:database}" DURATION 720d REPLICATION 1 DEFAULT'
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
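# Note on the TODO above: a minimal sketch of honoring the schema's
# `influxdb.default-retention-policy-days` parameter (default 720) instead of
# the hardcoded 720d, assuming slapparameter_dict were added to this
# template's context:
#   DURATION {{ slapparameter_dict.get('influxdb', {}).get('default-retention-policy-days', 720) }}d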
[grafana]
ipv6 = ${instance-parameter:ipv6-random}
@@ -163,17 +179,20 @@ ssl-cert-file = ${grafana-certificate:cert-file}
recipe = slapos.cookbook:wrapper
command-line =
{{ grafana_bin }} -config ${grafana-config-file:output} -homepath {{ grafana_homepath }}
{{ grafana_bin }}
server
-config ${grafana-config-file:output}
-homepath {{ grafana_homepath }}
wrapper-path = ${directory:service}/grafana
hash-existing-files =
${grafana-provisioning-datasources-config-file:location}
[grafana-certificate]
<= generate-certificate
<= generate-insecure-self-signed-certificate
[grafana-password]
# TODO
#recipe = slapos.cookbook:generate.password
recipe = slapos.cookbook:generate.password
username = admin
passwd = admin
[grafana-secret-key]
recipe = slapos.cookbook:generate.password
@@ -185,15 +204,94 @@ context =
section apache_frontend apache-frontend
key slapparameter_dict slap-configuration:configuration
depends =
${grafana-provisioning-datasources-config-file:output}
${grafana-provisioning-datasources-config-file:location}
${grafana-provisioning-dashboards-config-file:output}
[grafana-provisioning-datasources-config-file]
<= config-file
output = ${grafana:provisioning-datasources-dir}/datasource.yaml
context =
section influxdb influxdb
section loki loki
recipe = slapos.recipe.build
init =
# pre-create location, so that we can use hash-existing-files
import pathlib
datasource_file = pathlib.Path(location)
if not datasource_file.parent.exists():
datasource_file.parent.mkdir(parents=True)
if not datasource_file.exists():
datasource_file.touch()
# make sure this part is reinstalled when certificate is updated
import os
cert_mtime = -1
try:
cert_mtime = (
os.stat(options['loki-grafana-client-certificate-cert-file']).st_mtime
+ os.stat(options['loki-server-certificate-ca-file']).st_mtime
)
except FileNotFoundError:
pass
options['loki-grafana-client-certificate-cert-mtime'] = str(int(cert_mtime))
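# the mtime is also reused below as the provisioned datasource "version";
# grafana applies provisioning updates when the version increases, so the
# datasource is refreshed with the renewed certificate contents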
install =
import json
import os
def safe_read_file(path):
if os.path.exists(path):
with open(path) as f:
return f.read()
influxdb_data_source = {
"name": "telegraf",
"type": "influxdb",
"access": "proxy",
"url": options['influxdb-url'],
"user": options['influxdb-auth-username'],
"database": "telegraf",
"isDefault": True,
"jsonData": {
"tlsSkipVerify": True
},
"secureJsonData": {
"password": options['influxdb-auth-password'],
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False
}
loki_data_source = {
"name": "loki",
"type": "loki",
"access": "proxy",
"url": options['loki-server-url'],
"jsonData": {
"tlsAuth": True,
"tlsAuthWithCACert": True,
"maxLines": 50000,
},
"secureJsonData": {
# XXX maybe we can use file directly ?
# see https://github.com/grafana/grafana/discussions/44296#discussioncomment-2515929
"tlsCACert": safe_read_file(options['loki-server-certificate-ca-file']),
"tlsClientCert": safe_read_file(options['loki-grafana-client-certificate-cert-file']),
"tlsClientKey": safe_read_file(options['loki-grafana-client-certificate-key-file']),
},
"version": int(options['loki-grafana-client-certificate-cert-mtime']),
"editable": False,
}
config = {
"apiVersion": 1,
"datasources": [
influxdb_data_source,
loki_data_source,
],
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
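# writing JSON into datasources.yaml is fine: JSON is a subset of YAML,
# so grafana's YAML provisioning loader accepts it unchanged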
location = ${grafana:provisioning-datasources-dir}/datasources.yaml
loki-server-url = ${loki-server:url}
loki-server-certificate-ca-file = ${loki-server-certificate:ca-file}
loki-grafana-client-certificate-cert-file = ${loki-grafana-client-certificate:cert-file}
loki-grafana-client-certificate-key-file = ${loki-grafana-client-certificate:key-file}
influxdb-url = ${influxdb:url}
influxdb-auth-username = ${influxdb:auth-username}
influxdb-auth-password = ${influxdb:auth-password}
[grafana-provisioning-dashboards-config-file]
<= config-file
@@ -206,6 +304,15 @@ context =
hostname= ${grafana:ipv6}
port = ${grafana:port}
[grafana-provisioning-datasources-config-file-promise]
recipe = slapos.cookbook:wrapper
command-line =
{{ jq_bin }} -e
"if .datasources[1].secureJsonData.tlsClientCert != null and .datasources[1].secureJsonData.tlsCACert != null then true else false end"
${grafana-provisioning-datasources-config-file:location}
wrapper-path = ${directory:promise}/${:_buildout_section_name_}
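# jq -e sets its exit status from the last output value (non-zero for
# false/null), so this promise only passes once both the client certificate
# and the CA certificate are embedded in the generated datasource file.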
[telegraf]
recipe = slapos.cookbook:wrapper
extra-config-dir = ${directory:telegraf-extra-config-dir}
@@ -215,14 +322,8 @@ command-line =
wrapper-path = ${directory:service}/telegraf
[telegraf-config-file]
<= config-file
context =
section influxdb influxdb
section telegraf telegraf
section extra telegraf-config-file-extra
[telegraf-config-file-extra]
recipe = slapos.recipe.build
output = ${directory:etc}/${:_buildout_section_name_}.toml
telegraf-input-slapos-bin = {{ telegraf_input_slapos_bin }}
slapparameter-dict = ${slap-configuration:configuration}
init =
@@ -245,13 +346,56 @@ init =
import urllib.parse
import toml
slapparameter_dict = self.options["slapparameter-dict"]
slap_connection = self.buildout["slap-connection"]
influxdb = self.buildout['influxdb']
# files to create during install step
self._config_files = {}
inputs = collections.defaultdict(list)
processors = collections.defaultdict(list)
slapparameter_dict = self.options["slapparameter-dict"]
for application in slapparameter_dict.get("applications", []):
config = {
"agent": {
"debug": False,
"flush_interval": "10s",
"flush_jitter": "0s",
"hostname": "",
"interval": "10s",
"round_interval": True,
},
"tags": {
"computer_id": slap_connection['computer-id'],
},
# built-in inputs
"cpu": {
"drop": ["cpu_time"],
"percpu": True,
"totalcpu": True,
},
"disk": {},
"io": {},
"mem": {},
"system": {},
"inputs": inputs,
"processors": processors,
"outputs": {
"influxdb": {
"database": influxdb["database"],
"insecure_skip_verify": True,
"username": influxdb["auth-username"],
"password": influxdb["auth-password"],
"precision": "s",
"urls": [
influxdb["url"],
],
},
},
}
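# this dict replaces the static telegraf-config-file.cfg.in template that
# this commit removes: agent settings, built-in inputs and the influxdb
# output are now generated here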
# TODO: remove the "agent" nesting level below
for application in slapparameter_dict.get("agent", {}).get("applications", []):
partition_mapping = {}
for partition in application.get("partitions", []):
partition.setdefault("type", "default")
@@ -267,7 +411,12 @@ init =
"name_override": f"{partition['name']}-mysql",
"servers": [dsn],
"gather_innodb_metrics": True,
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
if partition["type"] == "erp5/mariadb":
@@ -278,39 +427,44 @@ init =
"dsn": dsn,
"query": [
{
"query": "select count(*) as message_count from message",
"query": """
select 'message' as cmf_activity_queue, count(*) as message_count from message
union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
""",
"field_columns_include": ["message_count"],
},
{
"query": "select count(*) as message_queue_count from message_queue",
"field_columns_include": ["message_queue_count"],
},
{
"query": "select count(*) as message_failed_count from message where processing_node=-2",
"field_columns_include": ["message_failed_count"],
},
{
"query": "select count(*) as message_queue_failed_count from message_queue where processing_node=-2",
"field_columns_include": ["message_queue_failed_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as message_waiting_time from message
where processing_node in (-1, 0) and message not like '%after_tag%'
select 'message' as cmf_activity_queue, count(*) as failed_message_count
from message where processing_node between -10 and -2
union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
from message_queue where processing_node between -10 and -2
""",
"field_columns_include": ["message_waiting_time"],
"field_columns_include": ["failed_message_count"],
"tag_columns_include": ["cmf_activity_queue"],
},
{
"query": """
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
as waiting_time, 'message' as cmf_activity_queue
from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
union all
select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
as message_queue_waiting_time from message_queue
where processing_node in (-1, 0) and message not like '%after_tag%'
as waiting_time, 'message_queue' as cmf_activity_queue
from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
""",
"field_columns_include": ["message_queue_waiting_time"],
}
"field_columns_include": ["waiting_time"],
"tag_columns_include": ["cmf_activity_queue"],
},
],
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
@@ -326,7 +480,12 @@ init =
],
"grok_timezone": "Local",
"name_override": f"{partition['name']}",
"tags": dict(partition.get("static-tags", {}), app=application["name"]),
"tags": dict(
partition.get("static-tags", {}),
app=application["name"],
name=partition["name"],
partition_reference=partition["reference"],
),
}
)
urls = application.get("urls", [])
@@ -344,12 +503,13 @@ init =
# x509_cert wants a port
if not parsed_url.port:
x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
inputs["x509_cert"].append({
"sources": [x509_url],
"tags": {"url": url},
"interval": "5h",
"tags": {"app": application["name"]},
})
inputs["x509_cert"].append({
"sources": [x509_url],
"tags": {"url": url},
"interval": "5h",
"tags": {"app": application["name"]},
})
# TODO some kind of GET request every X minutes ?
if application.get("type") == "SlapOS":
telegraf_slapos_input_config_file = os.path.join(
@@ -360,65 +520,69 @@ init =
"slapos": [{
"instance_root": application["instance-root"]}]}})
# TODO: supervisor process finder for
# https://github.com/influxdata/telegraf/tree/master/plugins/inputs/procstat ?
telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
inputs["execd"].append({
"name_override": f"{application['name']}-processes",
"command": [telegraf_slapos_input_command, '-config', telegraf_slapos_input_config_file],
"tags": {"app": application["name"]},
})
# "cleanup" slapos process names, remove hash from wrappers and -on-watch suffix
processors["regex"].append({
# drop measurements for not monitored partitions.
processors["starlark"].append({
"namepass": [f"{application['name']}-processes"],
"order": 1,
"source": f'''
def apply(metric):
if metric.tags.get('reference') in {list(partition_mapping)!r}:
return metric
'''
})
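# (a starlark apply() that returns nothing drops the metric, so only
# partitions declared in the application configuration are kept)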
# telegraf-input-slapos outputs the process name as "name", but we rename
# this to "process_name", so that it is more understandable in a global
# context and because we use the name of the partition as "name" everywhere
# else.
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"replace": [{
"tag": "name",
"dest": "process_name",
}]})
# "normalize" slapos process names, remove hash from hash-files and -on-watch suffix
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 3,
"tags": [{
"key": "name",
"pattern": "^(.*)-.{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"key": "process_name",
"pattern": "^(.*)-on-watch$",
"replacement": "$" + "{1}",
}]})
processors["regex"].append({
"namepass": [f"{application['name']}-processes"],
"order": 2,
"order": 4,
"tags": [{
"key": "name",
"pattern": "^(.*)-on-watch$",
"key": "process_name",
"pattern": "^(.*)-\\w{32}",
# XXX we concatenate strings so that we don't have to escape them for buildout
"replacement": "$" + "{1}",
}]})
# use consistent `partition_reference` for slappart
processors["rename"].append({
"namepass": [f"{application['name']}-processes"],
"order": 5,
"replace": [{
"tag": "reference",
"dest": "partition_reference",
}]})
processors["enum"].append({
"namepass": [ f"{application['name']}-processes"],
"order": 6,
"mapping": [{
# "tag": "group", # TODO: rename this in input plugin # XXX I don't remember what this means
"tag": "slappart",
"dest": "partition",
"tag": "partition_reference",
"dest": "name",
"value_mappings": partition_mapping,
}]})
# TODOs:
# - [ ] slapos input
# - [x] friendly name of slappart
# - [x] strip hashes from -on-watch
# - [x] activity metrics
# - [ ] alert dashboard
# - [ ] inclu "jerome-dev" partout ???
# - [ ] apdex
# - [ ] "job" is bad name in Explore
options["extra-config"] = toml.dumps({
"inputs": inputs,
"processors": processors})
# import pdb; pdb.set_trace()
# apdex
# SELECT sum("success") / sum("all") FROM
# (SELECT count("duration") AS "all" FROM "jerome-dev-balancer" WHERE $timeFilter GROUP BY time($__interval) fill(null)),
# (SELECT count("duration") AS "success" FROM "jerome-dev-balancer" WHERE ("resp_code" = '200' ) AND $timeFilter GROUP BY time($__interval) fill(null))
#SELECT sum("success") + sum("all") FROM
# (SELECT count("duration") AS "all" FROM "jerome-dev-balancer" WHERE $timeFilter GROUP BY time($__interval) fill(0)),
# (SELECT count("duration") AS "success" FROM "jerome-dev-balancer" WHERE ("resp_code" = '200' ) AND $timeFilter GROUP BY time($__interval) fill(0))
self._config_files[options['output']] = toml.dumps(config)
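# the install step below writes each file collected in self._config_files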
install =
import os
@@ -427,180 +591,324 @@ install =
with open(fname, 'w') as f:
f.write(content)
[loki]
boltdb-shipper-active-index-directory = ${directory:loki-boltdb-shipper-active-index-directory}
boltdb-shipper-cache-location = ${directory:loki-boltdb-shipper-cache-location}
compactor-working-directory = ${directory:loki-compactor-working-directory}
[loki-server]
storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
compactor-working-directory = ${directory:loki-compactor-working-directory}
path-prefix = ${directory:loki-dir}
ip = ${instance-parameter:ipv4-random}
read-1-http-port = 3101
read-1-grpc-port = 9096
read-1-memberlist-port = 7947
read-2-http-port = 3102
read-2-grpc-port = 9097
read-2-memberlist-port = 7948
write-http-port = 3103
write-grpc-port = 9098
write-memberlist-port = 7949
query-frontend-http-port = 3104
query-frontend-grpc-port = 9099
query-frontend-memberlist-port = 7950
querier-http-port = 3105
querier-grpc-port = 9100
querier-memberlist-port = 7951
index-gateway-http-port = 3106
index-gateway-grpc-port = 9101
index-gateway-memberlist-port = 7952
query-scheduler-http-port = 3107
query-scheduler-grpc-port = 9102
query-scheduler-memberlist-port = 7953
# compactor
nginx-port = 3100
url = http://${:ip}:${:nginx-port}
http-port = 3100
url = https://[${:ipv6}]:${:http-port}
ipv4 = ${instance-parameter:ipv4-random}
ipv6 = ${instance-parameter:ipv6-random}
ca-file = ${loki-server-certificate:ca-file}
cert-file = ${loki-server-certificate:cert-file}
key-file = ${loki-server-certificate:key-file}
# TODO: CRL
[loki-service-macro]
[loki-service]
recipe = slapos.cookbook:wrapper
command-line =
bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ loki_bin }} \
-config.file=${loki-config-file:output} \
\
-boltdb.shipper.compactor.ring.instance-addr=${loki:ip} \
-boltdb.shipper.compactor.ring.instance-id=${:_buildout_section_name_} \
-common.embedded-cachering.instance-addr=${loki:ip} \
-common.embedded-cachering.instance-id=${:_buildout_section_name_} \
-distributor.ring.instance-addr=${loki:ip} \
-distributor.ring.instance-id=${:_buildout_section_name_} \
-frontend.instance-addr=${loki:ip} \
-frontend.instance-port=${loki:query-frontend-grpc-port} \
-index-gateway.ring.instance-addr=${loki:ip} \
-index-gateway.ring.instance-id=${:_buildout_section_name_} \
-memberlist.advertise-port=${:memberlist-port} \
-memberlist.bind-port=${:memberlist-port} \
-memberlist.nodename=${:_buildout_section_name_} \
-query-scheduler.ring.instance-addr=${loki:ip} \
-query-scheduler.ring.instance-id=${:_buildout_section_name_} \
-ruler.ring.instance-addr=${loki:ip} \
-ruler.ring.instance-id=${:_buildout_section_name_} \
-server.grpc-listen-port=${:grpc-port} \
-server.http-listen-port=${:http-port} \
${:extra-command-line}'
{{ loki_bin }} -config.file=${loki-server-config-file:location}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
extra-command-line =
ready-url = ${loki-server:url}/ready
hash-files =
${loki-server-config-file:location}
hash-existing-files =
${loki-server-certificate:cert-file}
[loki-server-config-file]
location = ${directory:etc}/${:_buildout_section_name_}.yaml
recipe = slapos.recipe.build
install =
import json
loki_server = self.buildout['loki-server']
slapparameter_dict = self.buildout['slap-configuration']['configuration']
config = {
"auth_enabled": False,
"server": {
"http_listen_address": loki_server['ipv6'],
"http_listen_port": int(loki_server['http-port']),
"http_tls_config": {
"client_ca_file": loki_server['ca-file'],
"cert_file": loki_server['cert-file'],
"key_file": loki_server['key-file'],
"client_auth_type": "RequireAndVerifyClientCert",
},
"grpc_listen_address": loki_server['ipv4'],
"grpc_server_max_recv_msg_size": 104857600,
"grpc_server_max_send_msg_size": 104857600
},
"common": {
"instance_addr": loki_server['ipv4'],
"replication_factor": 1,
"ring": {
"instance_addr": loki_server['ipv4'],
"kvstore": {
"store": "inmemory"
}
},
"path_prefix": loki_server['path-prefix'],
},
"schema_config": {
"configs": [
{
"from": "2020-05-15",
"store": "tsdb",
"object_store": "filesystem",
"schema": "v13",
"index": {
"prefix": "index_",
"period": "24h"
}
}
]
},
"storage_config": {
"filesystem": {
"directory": loki_server['storage-filesystem-directory'],
}
},
"limits_config": {
"ingestion_rate_mb": 1024,
"ingestion_burst_size_mb": 1024,
"max_entries_limit_per_query": 50001,
"reject_old_samples": False,
"retention_period": '{}d'.format(
slapparameter_dict.get('loki', {}).get('retention-period-days', 60))
},
"frontend_worker": {
"grpc_client_config": {
# TODO: check whether this is needed
# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
"max_send_msg_size": 268435456
}
},
"compactor": {
"working_directory": loki_server['compactor-working-directory'],
"delete_request_store": "filesystem",
"retention_enabled": True,
"retention_delete_delay": "2h",
}
}
with open(options['location'], 'w') as f:
json.dump(config, f, indent=2)
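# as with the grafana datasources file, JSON output is valid YAML, so
# loki's -config.file accepts this .yaml file as-is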
[loki-server-certificate-init-certificate]
recipe = slapos.recipe.build
init =
# pre-create a file at the path of the certificate,
# so that we can use hash-existing-files options
import pathlib
cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
if not cert_file.parent.exists():
cert_file.parent.mkdir()
if not cert_file.exists():
cert_file.touch()
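# ([loki-service] lists this certificate in hash-existing-files, so the
# service wrapper is regenerated, and the service restarted, when caucase
# renews the certificate)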
[loki-server-certificate]
init = ${loki-server-certificate-init-certificate:init}
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
{{
caucase.updater(
prefix='loki-server-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-server-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-server}',
crt_path='${loki-server-certificate:cert-file}',
ca_path='${loki-server-certificate:ca-file}',
crl_path='${loki-server-certificate:crl-file}',
key_path='${loki-server-certificate:key-file}',
template_csr='${loki-server-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-server-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = loki-server
[ req_ext ]
subjectAltName = @alt_names
[ alt_names ]
IP.1 = ${loki-server:ipv4}
IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-server-certificate-prepare-csr]
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${loki-server-certificate-csr-config:output}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
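# (-keyout /dev/null is intentional: this CSR only serves as a template for
# its subject and extensions; the caucase updater re-requests a certificate
# with the same subject using its own freshly generated key)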
[loki-listen-promise-macro]
[loki-server-listen-promise]
<= check-url-available-promise
url = http://${loki:ip}:${:port}/ready
[loki-read-1-service]
<= loki-service-macro
extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-2-grpc-port} -query-scheduler.ring.instance-port=${loki:read-1-grpc-port}
http-port = ${loki:read-1-http-port}
grpc-port = ${loki:read-1-grpc-port}
memberlist-port = ${loki:read-1-memberlist-port}
[loki-read-1-listen-promise]
<= loki-listen-promise-macro
port = ${loki-read-1-service:http-port}
[loki-read-2-service]
<= loki-service-macro
extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-1-grpc-port} -query-scheduler.ring.instance-port=${loki:read-2-grpc-port}
http-port = ${loki:read-2-http-port}
grpc-port = ${loki:read-2-grpc-port}
memberlist-port = ${loki:read-2-memberlist-port}
[loki-read-2-listen-promise]
<= loki-listen-promise-macro
port = ${loki-read-2-service:http-port}
[loki-write-service]
<= loki-service-macro
extra-command-line = -target=write
http-port = ${loki:write-http-port}
grpc-port = ${loki:write-grpc-port}
memberlist-port = ${loki:write-memberlist-port}
[loki-write-listen-promise]
<= loki-listen-promise-macro
port = ${loki-write-service:http-port}
[loki-querier-service]
<= loki-service-macro
extra-command-line = -target=querier -querier.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port} -query-scheduler.ring.instance-port=${loki:querier-grpc-port}
http-port = ${loki:querier-http-port}
grpc-port = ${loki:querier-grpc-port}
memberlist-port = ${loki:querier-memberlist-port}
[loki-querier-listen-promise]
<= loki-listen-promise-macro
port = ${loki-querier-service:http-port}
[loki-index-gateway-service]
<= loki-service-macro
extra-command-line = -target=index-gateway -boltdb.shipper.query-ready-num-days=30
# XXX -boltdb.shipper.query-ready-num-days=30 useful ?
http-port = ${loki:index-gateway-http-port}
grpc-port = ${loki:index-gateway-grpc-port}
memberlist-port = ${loki:index-gateway-memberlist-port}
[loki-index-gateway-listen-promise]
<= loki-listen-promise-macro
port = ${loki-index-gateway-service:http-port}
[loki-query-frontend-service]
<= loki-service-macro
extra-command-line = -target=query-frontend -frontend.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port}
http-port = ${loki:query-frontend-http-port}
grpc-port = ${loki:query-frontend-grpc-port}
memberlist-port = ${loki:query-frontend-memberlist-port}
[loki-query-frontend-listen-promise]
<= loki-listen-promise-macro
port = ${loki-query-frontend-service:http-port}
[loki-query-scheduler-service]
<= loki-service-macro
extra-command-line = -target=query-scheduler
http-port = ${loki:query-scheduler-http-port}
grpc-port = ${loki:query-scheduler-grpc-port}
memberlist-port = ${loki:query-scheduler-memberlist-port}
[loki-query-scheduler-listen-promise]
<= loki-listen-promise-macro
port = ${loki-query-scheduler-service:http-port}
[loki-config-file]
<= config-file
context =
section loki loki
url = ${loki-service:ready-url}
ca-cert-file = ${loki-server:ca-file}
cert-file = ${loki-promise-client-certificate:cert-file}
key-file = ${loki-promise-client-certificate:key-file}
[loki-nginx-service]
recipe = slapos.cookbook:wrapper
command-line =
{{ nginx_bin }} -p ${directory:loki-nginx-dir} -c ${loki-nginx-config-file:output}
wrapper-path = ${directory:service}/${:_buildout_section_name_}
url = http://${loki:ip}:${loki:nginx-port}
[loki-nginx-listen-promise]
<= check-url-available-promise
url = ${loki-nginx-service:url}
[loki-client-certificate]
key-file = ${directory:etc}/${:_buildout_section_name_}.key
cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
common-name = ${:_buildout_section_name_}
ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
# agent + server
[loki-client-certificate-csr-config]
recipe = slapos.recipe.template
inline =
[req]
prompt = no
# req_extensions = req_ext
distinguished_name = dn
[ dn ]
CN = ${:_buildout_section_name_}
# [ req_ext ]
# subjectAltName = @alt_names
# [ alt_names ]
# IP.1 = ${loki-server:ipv4}
# IP.2 = ${loki-server:ipv6}
output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
[loki-client-certificate-prepare-csr]
# variable, to be set by sections extending this macro
config =
recipe = plone.recipe.command
command =
if [ ! -f '${:csr}' ] ; then
{{ openssl_bin }} req \
-newkey rsa \
-batch \
-new \
-sha256 \
-nodes \
-keyout /dev/null \
-config '${:config}' \
-out '${:csr}'
fi
stop-on-error = true
csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem
[loki-promise-client-certificate]
<= loki-client-certificate
[loki-promise-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promise-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promise-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promise-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promise-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promise-client}',
crt_path='${loki-promise-client-certificate:cert-file}',
ca_path='${loki-promise-client-certificate:ca-file}',
crl_path='${loki-promise-client-certificate:crl-file}',
key_path='${loki-promise-client-certificate:key-file}',
template_csr='${loki-promise-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-grafana-client-certificate]
<= loki-client-certificate
[loki-grafana-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-grafana-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-grafana-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-grafana-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-grafana-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-grafana-client}',
crt_path='${loki-grafana-client-certificate:cert-file}',
ca_path='${loki-grafana-client-certificate:ca-file}',
crl_path='${loki-grafana-client-certificate:crl-file}',
key_path='${loki-grafana-client-certificate:key-file}',
template_csr='${loki-grafana-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
# agent
[loki-promtail-client-certificate]
<= loki-client-certificate
[loki-promtail-client-certificate-csr-config]
<= loki-client-certificate-csr-config
[loki-promtail-client-certificate-prepare-csr]
<= loki-client-certificate-prepare-csr
config = ${loki-promtail-client-certificate-csr-config:output}
{{
caucase.updater(
prefix='loki-promtail-client-certificate',
buildout_bin_directory=buildout['bin-directory'],
updater_path='${directory:service}/loki-promtail-client-certificate-updater',
url='${loki-caucased:url}',
data_dir='${directory:caucase-updater-loki-promtail-client}',
crt_path='${loki-promtail-client-certificate:cert-file}',
ca_path='${loki-promtail-client-certificate:ca-file}',
crl_path='${loki-promtail-client-certificate:crl-file}',
key_path='${loki-promtail-client-certificate:key-file}',
template_csr='${loki-promtail-client-certificate-prepare-csr:csr}',
openssl=openssl_bin,
)}}
[loki-caucased]
port = 18080
ip = ${instance-parameter:ipv6-random}
netloc = [${:ip}]:${:port}
url = http://${:netloc}/
# service_auto_approve_count needs:
# server: loki
# clients: loki promise, grafana, promtail
# TODO: this is a bad default
{{
caucase.caucased(
prefix='loki-caucased',
buildout_bin_directory=buildout['bin-directory'],
caucased_path='${directory:service}/loki-caucased',
backup_dir='${directory:backup-caucased-loki}',
data_dir='${directory:srv-caucased-loki}',
netloc='${loki-caucased:netloc}',
tmp='${directory:tmp}',
service_auto_approve_count=5,
user_auto_approve_count=1,
key_len=2048,
)}}
[loki-nginx-config-file]
<= config-file
context =
section loki loki
[promtail]
recipe = slapos.cookbook:wrapper
@@ -618,8 +926,9 @@ url = http://${:ip}:${:http-port}
recipe = slapos.recipe.build
location = ${directory:etc}/${:_buildout_section_name_}.cfg
slapparameter-dict = ${slap-configuration:configuration}
install =
depends = ${loki-promtail-client-certificate:recipe}
{% raw %}
install =
import os
# XXX make extra eggs available to buildout
import zc.buildout
@@ -635,6 +944,7 @@ install =
import yaml
slapparameter_dict = self.options['slapparameter-dict']
slap_connection = self.buildout["slap-connection"]
cfg = {
"server": {
"http_listen_address": self.buildout['promtail']['ip'],
@@ -649,7 +959,14 @@ install =
},
"clients": [
{
"url": "{}/loki/api/v1/push".format(self.buildout['loki']['url']),
"url": "{}/loki/api/v1/push".format(self.buildout['loki-server']['url']),
"tls_config": {
"ca_file": self.buildout['loki-server']['ca-file'],
"cert_file": self.buildout['loki-promtail-client-certificate']['cert-file'],
"key_file": self.buildout['loki-promtail-client-certificate']['key-file'],
},
# this might not be good for copytruncate option of logrotate
# see https://grafana.com/docs/loki/latest/send-data/promtail/logrotation/
"batchwait": "5s"
}
],
@@ -664,7 +981,7 @@ install =
def get_static_configs(partition, job_name, path, application):
directory = ''
if partition.get('reference'):
if partition.get('reference') and 'instance-root' in path:
directory = os.path.join(application['instance-root'], partition['reference'])
return [
{
@@ -674,13 +991,41 @@ install =
"labels": dict(
partition.get('static-tags', {}),
job=job_name,
partition=partition['name'],
app=application['name'],
name=partition['name'],
partition_reference=partition['reference'],
computer_id=slap_connection['computer-id'],
__path__=path.format(directory=directory),
)
}
]
# Add grafana's and influxdb's own logs. TODO: skip these in agent mode
cfg['scrape_configs'].append(
{
"job_name": "Grafana",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Grafana", "reference": slap_connection['partition-id']},
"Grafana",
f"{self.buildout['directory']['home']}/.*_grafana*.log",
{"name": "Grafana"}
)
}
)
cfg['scrape_configs'].append(
{
"job_name": "Influxdb",
"pipeline_stages": [],
"static_configs": get_static_configs(
{"name": "Influxdb", "reference": slap_connection['partition-id']},
"Influxdb",
f"{self.buildout['directory']['home']}/.*_influxdb*.log",
{"name": "Grafana"},
)
}
)
for application in slapparameter_dict.get('applications', []):
for partition in application.get('partitions', []):
partition.setdefault("type", "default")
@@ -790,9 +1135,13 @@ install =
"stages": [
{
"multiline": {
# TODO
#"firstline": "^# Time: \\d{2}\\d{2}\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}",
"firstline": r"^# Time: \d{2}.*",
# between each slow query, slow query log has a first line like:
# # Time: 231008 16:29:01
# and then a second like:
# # User@Host: user[user] @ [10.0.71.207]
# but the first line is not repeated for subsequent queries that happen
# in the same second
"firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
"max_wait_time": "3s"
}
},
@@ -902,14 +1251,14 @@ install =
application,
)})
if partition.get('file-path'):
if partition.get('log-file-patterns'):
job_name = partition['name']
cfg['scrape_configs'].append({
"job_name": job_name,
"static_configs": get_static_configs(
partition,
job_name,
f"{partition['file-path']}",
f"{partition['log-file-patterns']}",
application,
)})
@@ -932,7 +1281,6 @@ name = Grafana Frontend
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
shared = true
config-url = ${grafana:url}
config-https-only = true
return = domain secure_access
[apache-frontend-available-promise]
@@ -946,21 +1294,16 @@ instance-promises =
${influxdb-listen-promise:path}
${influxdb-password-promise:wrapper-path}
${influxdb-database-ready-promise:wrapper-path}
${influxdb-create-default-data-retention-policy-promise:wrapper-path}
${grafana-listen-promise:path}
${loki-query-frontend-listen-promise:path}
${loki-query-scheduler-listen-promise:path}
# ${loki-index-gateway-listen-promise:path}
${loki-querier-listen-promise:path}
# ${loki-read-1-listen-promise:path}
# ${loki-read-2-listen-promise:path}
${loki-write-listen-promise:path}
${loki-nginx-listen-promise:path}
${grafana-provisioning-datasources-config-file-promise:wrapper-path}
${loki-server-listen-promise:path}
${promtail-listen-promise:path}
${apache-frontend-available-promise:path}
[publish-connection-parameter]
recipe = slapos.cookbook:publish
recipe = slapos.cookbook:publish.serialised
influxdb-url = ${influxdb:url}
influxdb-database = ${influxdb:database}
influxdb-username = ${influxdb:auth-username}
@@ -969,6 +1312,7 @@ telegraf-extra-config-dir = ${telegraf:extra-config-dir}
grafana-url = ${grafana:url}
grafana-username = ${grafana:admin-user}
grafana-password = ${grafana:admin-password}
loki-url = ${loki:url}
loki-url = ${loki-server:url}
loki-caucase-url = ${loki-caucased:url}
promtail-url = ${promtail:url}
url = ${apache-frontend:connection-secure_access}
# inspired by
# https://github.com/grafana/loki/blob/1489c1731277c327e3661da182bfc6c90d4559f4/tools/dev/loki-boltdb-storage-s3/docker-compose.yml
# and other configuration examples with microservices, because the single binary
# mode assumes running on 127.0.0.1, but in slapos we want to bind on the
# partition's addresses
auth_enabled: false
http_prefix:
server:
http_listen_address: {{ loki['ip'] }}
grpc_listen_address: {{ loki['ip'] }}
grpc_server_max_recv_msg_size: 1.048576e+08
grpc_server_max_send_msg_size: 1.048576e+08
# # TODO ?
# wal:
# enabled: true
# dir: /loki/wal
common:
compactor_address: http://{{ loki['ip'] }}:{{ loki['write-http-port'] }}
schema_config:
configs:
- from: 2020-05-15
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
storage_config:
boltdb_shipper:
active_index_directory: {{ loki['boltdb-shipper-active-index-directory'] }}
cache_location: {{ loki['boltdb-shipper-cache-location'] }}
filesystem:
directory: {{ loki['storage-filesystem-directory'] }}
limits_config:
reject_old_samples: false
enforce_metric_name: false
ingestion_rate_mb: 1024
ingestion_burst_size_mb: 1024
ingester:
lifecycler:
address: {{ loki['ip'] }}
ring:
kvstore:
store: memberlist
replication_factor: 1
compactor:
compaction_interval: 1m
retention_enabled: true
working_directory: {{ loki['compactor-working-directory'] }}
frontend:
log_queries_longer_than: 5s
compress_responses: true
max_outstanding_per_tenant: 2048
tail_proxy_url: http://{{ loki['ip'] }}:{{ loki['querier-http-port']}}
frontend_worker:
scheduler_address: {{ loki['ip'] }}:{{ loki['query-scheduler-grpc-port'] }}
memberlist:
bind_addr:
- {{ loki['ip'] }}
join_members:
# - {{ loki['ip'] }}:{{ loki['read-1-memberlist-port'] }}
- {{ loki['ip'] }}:{{ loki['querier-memberlist-port'] }}
# - {{ loki['ip'] }}:{{ loki['write-memberlist-port'] }}
query_scheduler:
max_outstanding_requests_per_tenant: 1024
querier:
query_ingesters_within: 2h
daemon off;
events {
worker_connections 1024;
}
error_log /dev/stdout;
http {
default_type application/octet-stream;
access_log /dev/stdout;
sendfile on;
tcp_nopush on;
upstream read {
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
}
upstream write {
server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
}
upstream cluster {
server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
server {{ loki['ip'] }}:{{ loki['querier-http-port'] }};
}
upstream query-frontend {
server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
}
server {
listen {{ loki['ip'] }}:{{ loki['nginx-port'] }};
# XXX while debugging
listen [{{ loki['ipv6'] }}]:{{ loki['nginx-port'] }};
location / {
return 200 'OK';
}
location = /ring {
proxy_pass http://cluster$request_uri;
}
location = /memberlist {
proxy_pass http://cluster$request_uri;
}
location = /config {
proxy_pass http://cluster$request_uri;
}
location = /metrics {
proxy_pass http://cluster$request_uri;
}
location = /ready {
proxy_pass http://cluster$request_uri;
}
location = /loki/api/v1/push {
proxy_pass http://write$request_uri;
}
location = /loki/api/v1/tail {
proxy_pass http://read$request_uri;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
}
location ~ /loki/api/.* {
proxy_pass http://query-frontend$request_uri;
}
}
}
[buildout]
extends =
../../stack/slapos.cfg
../../stack/caucase/buildout.cfg
../../stack/nodejs.cfg
../../component/make/buildout.cfg
../../component/golang/buildout.cfg
../../component/openssl/buildout.cfg
../../component/curl/buildout.cfg
../../component/dash/buildout.cfg
../../component/nginx/buildout.cfg
../../component/jq/buildout.cfg
../../component/systemd/buildout.cfg
../../component/fluent-bit/buildout.cfg
buildout.hash.cfg
parts =
@@ -15,21 +18,10 @@ parts =
instance-profile
gowork
influxdb-config-file
telegraf-config-file
grafana-config-file
grafana-provisioning-datasources-config-file
grafana-provisioning-dashboards-config-file
loki-config-file
loki-nginx-config-file
fluent-bit
; [nodejs]
; <= nodejs-16.19.0
[gowork]
# XXX speed up development cycle by not rebuilding workspace on every software run
# XXX does not work ?
update-command =
[go_github.com_grafana_grafana]
<= go-git-package
@@ -41,7 +33,7 @@ revision = v10.1.2-0-g8e428858dd
<= go-git-package
go.importpath = github.com/grafana/loki
repository = https://github.com/grafana/loki
revision = v2.9.1-0-gd9d5ed4a1
revision = v3.1.0-0-g935aee77e
[go_github.com_influxdata_influxdb]
<= go-git-package
@@ -59,7 +51,7 @@ revision = v1.28.1-0-g3ea9ffbe2
<= go-git-package
go.importpath = github.com/perrinjerome/telegraf-input-slapos
repository = https://github.com/perrinjerome/telegraf-input-slapos
revision = v0.0.1-0-gf8981f3
revision = v0.0.2-0-gd4c5221
[go_github.com_prometheus_prometheus]
<= go-git-package
@@ -84,15 +76,18 @@ install =
${go_github.com_perrinjerome_slapos_telegraf_input:location}:./...
${go_github.com_prometheus_prometheus:location}:./cmd/...
# disable cgo, to prevent loki/promtail from using go-systemd
environment =
CGO_ENABLED = 0
CGO_ENABLED=1
CGO_CFLAGS=-I${systemd:location}/include
buildflags =
-tags promtail_journal_enabled
cpkgpath =
${systemd:location}
telegraf-bin = ${:bin}/telegraf
telegraf-input-slapos-bin = ${:bin}/telegraf-input-slapos
influx-bin = ${:bin}/influx
influxd-bin = ${:bin}/influxd
grafana-bin = ${:bin}/grafana-server
grafana-bin = ${grafana:binpath}/grafana
grafana-homepath = ${grafana:homepath}
loki-bin = ${:bin}/loki
promtail-bin = ${:bin}/promtail
@@ -105,8 +100,6 @@ command = bash -ce "
. ${gowork:env.sh} && \
go install github.com/google/wire/cmd/wire@v0.5.0 && \
wire gen -tags oss ./pkg/server ./pkg/cmd/grafana-cli/runner && \
# Unlike loki, grafana _needs_ CGO_ENABLED, so we override here
export CGO_ENABLED=1 && \
go run build.go setup && \
go run build.go build && \
export NODE_OPTIONS=--max_old_space_size=8192 && \
@@ -119,6 +112,8 @@ command = bash -ce "
rm -rf ${buildout:directory}/.cache/yarn/
"
homepath = ${go_github.com_grafana_grafana:location}
# XXX "linux-amd64" is not portable here
binpath = ${go_github.com_grafana_grafana:location}/bin/linux-amd64
stop-on-error = true
[download-file-base]
@@ -128,24 +123,15 @@ url = ${:_profile_base_location_}/${:filename}
[influxdb-config-file]
<= download-file-base
[telegraf-config-file]
<= download-file-base
[grafana-config-file]
<= download-file-base
[grafana-provisioning-datasources-config-file]
<= download-file-base
[grafana-provisioning-dashboards-config-file]
<= download-file-base
[loki-config-file]
<= download-file-base
[loki-nginx-config-file]
<= download-file-base
[instance-eggs]
recipe = zc.recipe.egg
eggs =
@@ -167,13 +153,16 @@ context =
key grafana_bin gowork:grafana-bin
key grafana_homepath gowork:grafana-homepath
key loki_bin gowork:loki-bin
raw nginx_bin ${nginx:location}/sbin/nginx
key promtail_bin gowork:promtail-bin
key curl_bin :curl-bin
key dash_bin :dash-bin
key jq_bin :jq-bin
curl-bin = ${curl:location}/bin/curl
dash-bin = ${dash:location}/bin/dash
depends = ${instance-eggs:eggs}
jq-bin = ${jq:location}/bin/jq
depends = ${instance-eggs:eggs} ${caucase-eggs:eggs}
import-list =
file caucase caucase-jinja2-library:target
[versions]
inotifyx = 0.2.2
......
{
"name": "Grafana",
"description": "Grafana, Telegraf and Influxdb",
"description": "Grafana, Influxdb, Loki and Telegraf",
"serialisation": "json-in-xml",
"software-type": {
"default": {
"title": "Default",
"description": "Grafana, Telegraf and Influxdb in same partition",
"request": "instance-input-schema.json",
"response": "instance-output-schema.json",
"description": "Grafana, Influxdb and Loki",
"request": "instance-default-input-schema.json",
"response": "instance-default-output-schema.json",
"index": 0
},
"agent": {
"title": "Agent",
"description": "Telegraf agent sending metrics to Influxdb and Promtail agent sending logs to Loki",
"request": "instance-agent-input-schema.json",
"response": "instance-agent-output-schema.json",
"index": 0
}
}
......
# Telegraf configuration
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared plugins.
# Even if a plugin has no configuration, it must be declared in here
# to be active. Declaring a plugin means just specifying the name
# as a section with no variables. To deactivate a plugin, comment
# out the name and any variables.
# Use 'telegraf -config telegraf.toml -test' to see what metrics a config
# file would generate.
# One rule that plugins conform to is wherever a connection string
# can be passed, the values '' and 'localhost' are treated specially.
# They indicate to the plugin to use their own builtin configuration to
# connect to the local system.
# NOTE: The configuration has a few required parameters. They are marked
# with 'required'. Be sure to edit those to make this configuration work.
# Tags can also be specified via a normal map, but only one form at a time:
[tags]
# dc = "us-east-1"
# Configuration for telegraf agent
[agent]
# Default data collection interval for all plugins
interval = "10s"
# Rounds collection interval to 'interval'
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
# Default data flushing interval for all outputs. You should not set this below
# interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s"
# Jitter the flush interval by a random amount. This is primarily to avoid
# large write spikes for users running a large number of telegraf instances.
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"
# Run telegraf in debug mode
debug = false
# Override default hostname, if empty use os.Hostname()
hostname = ""
###############################################################################
# OUTPUTS #
###############################################################################
[outputs]
# Configuration for influxdb server to send metrics to
[outputs.influxdb]
# The full HTTP or UDP endpoint URL for your InfluxDB instance
# Multiple urls can be specified for InfluxDB cluster support.
urls = ["{{ influxdb['url'] }}"]
insecure_skip_verify = true # because we are using a self signed certificate
# The target database for metrics (telegraf will create it if not exists)
database = "{{ influxdb['database'] }}" # required
# Precision of writes, valid values are n, u, ms, s, m, and h
# note: using second precision greatly helps InfluxDB compression
precision = "s"
# Connection timeout (for the connection with InfluxDB), formatted as a string.
# If not provided, will default to 0 (no timeout)
# timeout = "5s"
username = "{{ influxdb['auth-username'] }}"
password = "{{ influxdb['auth-password'] }}"
# Set the user agent for HTTP POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
# Set UDP payload size, defaults to InfluxDB UDP Client default (512 bytes)
# udp_payload = 512
###############################################################################
# PLUGINS #
###############################################################################
# Read metrics about cpu usage
[cpu]
# Whether to report per-cpu stats or not
percpu = true
# Whether to report total system cpu stats or not
totalcpu = true
# Comment this line if you want the raw CPU time metrics
drop = ["cpu_time"]
# Read metrics about memory usage
[mem]
# no configuration
[disk]
[io]
[system]
{{ extra['extra-config'] }}
###############################################################################
# To add ad-hoc config, don't edit this file directly, but place your config
# files in {{ telegraf['extra-config-dir'] }}
###############################################################################
@@ -25,14 +25,16 @@
#
##############################################################################
from __future__ import unicode_literals
import functools
import io
import json
import logging
import os
import pathlib
import re
import tempfile
import textwrap
import time
import json
import urllib.parse
import psutil
import requests
@@ -40,7 +42,6 @@ from six.moves import configparser
from slapos.testing.testcase import makeModuleSetUpAndTestCaseClass
setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass(
os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', 'software.cfg')))
@@ -52,50 +53,72 @@ class GrafanaTestCase(SlapOSInstanceTestCase):
Since the instances takes time to start and stop,
we increase the number of retries.
"""
instance_max_retry = 50
# instance_max_retry = 50
instance_max_retry = 30 # TODO
report_max_retry = 30
class TestGrafana(GrafanaTestCase):
def setUp(self):
self.grafana_url = self.computer_partition.getConnectionParameterDict(
)['grafana-url']
self.connection_params = json.loads(
self.computer_partition.getConnectionParameterDict()['_']
)
self.grafana_url = self.connection_params['grafana-url']
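# (connection parameters are JSON-serialised under the "_" key since the
# instance switched to slapos.cookbook:publish.serialised)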
def test_grafana_available(self):
resp = requests.get(self.grafana_url, verify=False)
self.assertEqual(requests.codes.ok, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.ok)
def test_grafana_api(self):
# check API is usable
api_org_url = '{self.grafana_url}/api/org'.format(**locals())
api_org_url = f'{self.grafana_url}/api/org'
resp = requests.get(api_org_url, verify=False)
self.assertEqual(requests.codes.unauthorized, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.unauthorized)
connection_params = self.computer_partition.getConnectionParameterDict()
resp = requests.get(
api_org_url,
verify=False,
auth=requests.auth.HTTPBasicAuth(
self.connection_params['grafana-username'],
self.connection_params['grafana-password'],
))
self.assertEqual(resp.status_code, requests.codes.ok)
self.assertEqual(resp.json()['id'], 1)
def test_grafana_datasource_provisioned(self):
# data sources are provisioned
connection_params = self.computer_partition.getConnectionParameterDict()
resp = requests.get(
'{self.grafana_url}/api/datasources'.format(**locals()),
verify=False,
auth=requests.auth.HTTPBasicAuth(
connection_params['grafana-username'],
connection_params['grafana-password'],
))
self.assertEqual(requests.codes.ok, resp.status_code)
get = functools.partial(
requests.get,
verify=False,
auth=requests.auth.HTTPBasicAuth(
self.connection_params['grafana-username'],
self.connection_params['grafana-password'],
)
)
datasources_resp = get(f'{self.grafana_url}/api/datasources')
self.assertEqual(datasources_resp.status_code, requests.codes.ok)
self.assertEqual(
sorted([ds['type'] for ds in datasources_resp.json()]),
sorted(['influxdb', 'loki']))
# data sources are usable
# for this we need to wait a bit, because they are only usable once
# some data has been ingested
influxdb, = [ds for ds in datasources_resp.json() if ds['type'] == 'influxdb']
loki, = [ds for ds in datasources_resp.json() if ds['type'] == 'loki']
for retry in range(16):
influxdb_health = get(f'{self.grafana_url}/api/datasources/uid/{influxdb["uid"]}/health').json()
if influxdb_health.get('status') == "OK":
break
time.sleep(retry)
self.assertEqual(influxdb_health['status'], "OK")
for retry in range(16):
loki_health = get(f'{self.grafana_url}/api/datasources/uid/{loki["uid"]}/resources/labels?start={time.time() - 1000}').json()
if loki_health.get('data'):
break
time.sleep(retry)
self.assertEqual(loki_health['status'], "success")
self.assertIn("app", loki_health['data'])
def test_email_disabled(self):
config = configparser.ConfigParser()
......@@ -114,14 +137,14 @@ class TestGrafanaEmailEnabled(GrafanaTestCase):
@classmethod
def getInstanceParameterDict(cls):
return {
"smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username",
"smtp-password": "smtp_password",
'smtp-verify-ssl': cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name",
}
return {'_': json.dumps({
"smtp-server": "smtp.example.com:25",
"smtp-username": "smtp_username",
"smtp-password": "smtp_password",
"smtp-verify-ssl": cls.smtp_verify_ssl,
"email-from-address": "grafana@example.com",
"email-from-name": "Grafana From Name",
})}
def test_email_enabled(self):
config = configparser.ConfigParser()
......@@ -146,194 +169,158 @@ class TestGrafanaEmailEnabledSkipVerify(TestGrafanaEmailEnabled):
class TestInfluxDb(GrafanaTestCase):
def setUp(self):
self.influxdb_url = self.computer_partition.getConnectionParameterDict(
)['influxdb-url']
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
def test_influxdb_available(self):
ping_url = '{self.influxdb_url}/ping'.format(**locals())
ping_url = f'{self.influxdb_url}/ping'
resp = requests.get(ping_url, verify=False)
self.assertEqual(requests.codes.no_content, resp.status_code)
self.assertEqual(resp.status_code, requests.codes.no_content)
def test_influxdb_api(self):
query_url = '{self.influxdb_url}/query'.format(**locals())
connection_params = self.computer_partition.getConnectionParameterDict()
query_url = f'{self.influxdb_url}/query'
for i in range(10):
for i in range(16):
# retry, as it may take a moment for the databases to be created
resp = requests.get(
query_url,
verify=False,
params=dict(
q='SHOW DATABASES',
u=self.connection_params['influxdb-username'],
p=self.connection_params['influxdb-password']))
self.assertEqual(resp.status_code, requests.codes.ok)
result, = resp.json()['results']
if result['series'] and 'values' in result['series'][0]:
break
time.sleep(0.5 * i)
self.assertIn(
[connection_params['influxdb-database']], result['series'][0]['values'])
[self.connection_params['influxdb-database']], result['series'][0]['values'])
class TestTelegraf(GrafanaTestCase):
__partition_reference__ = 'G'
@classmethod
def getInstanceParameterDict(cls):
parameter_dict = {
"agent": {
"applications": [
{
"name": "slapos-standalone-from-test",
"type": "SlapOS",
"instance-root": cls.slap._instance_root,
"partitions": [
{
"name": "test grafana - partition name",
"type": "default",
"reference": "G0"
},
],
},
],
},
}
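# instance parameters are "serialised": the whole dict is passed as JSON
# under a single '_' key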
return {'_': json.dumps(parameter_dict)}
def test_telegraf_running(self):
with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo()
process_info, = [p for p in all_process_info if 'telegraf' in p['name']]
self.assertEqual('RUNNING', process_info['statename'])
self.assertEqual(process_info['statename'], 'RUNNING')
def test_telegraf_ingest_slapos_metrics(self):
self.connection_params = json.loads(self.computer_partition.getConnectionParameterDict()['_'])
self.influxdb_url = self.connection_params['influxdb-url']
# wait for data to be ingested
time.sleep(16)
query_url = f'{self.influxdb_url}/query'
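# InfluxQL: one series per distinct tag set (partition reference, name,
# computer id, process name), taking the max process state over 5-minute
# buckets within the last 5 minutes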
query = """
SELECT max("state")
FROM "slapos-standalone-from-test-processes"
WHERE time >= now() - 5m and time <= now()
GROUP BY time(5m),
"partition_reference"::tag,
"name"::tag,
"computer_id"::tag,
"process_name"::tag
fill(null)
"""
get = functools.partial(
requests.get,
verify=False,
params=dict(
q=query,
db=self.connection_params['influxdb-database'],
u=self.connection_params['influxdb-username'],
p=self.connection_params['influxdb-password'],
),
)
for i in range(16):
resp = get(query_url)
if resp.ok and resp.json()['results'][0].get('series'):
break
time.sleep(i)
series = resp.json()['results'][0].get('series')
# hashes and "-on-watch" are removed from process_name
self.assertIn('grafana', [s['tags']['process_name'] for s in series])
self.assertIn('telegraf', [s['tags']['process_name'] for s in series])
self.assertIn('loki-service', [s['tags']['process_name'] for s in series])
self.assertIn('loki-grafana-client-certificate-updater', [s['tags']['process_name'] for s in series])
tags = [s['tags'] for s in series][0]
self.assertEqual(tags['name'], 'test grafana - partition name')
self.assertEqual(tags['computer_id'], self.slap._computer_id)
self.assertEqual(
set([s['tags']['partition_reference'] for s in series]),
{'G0'},
)
class TestLoki(GrafanaTestCase):
instance_max_retry = 2
@classmethod
def getInstanceParameterDict(cls):
cls._logfile = tempfile.NamedTemporaryFile(suffix='log')
cls.addClassCleanup(cls._logfile.close)
parameter_dict = {
"agent": {
"applications": [
{
"name": "System",
"instance-root": "/",
"partitions": [
{
# no slapos for system application
# XXX example
"name": "syslog",
"reference": "syslog",
"files": [
"/srv/slapgrid/slappart15/grosgzip/bench.log",
]
},
]
},
{
"name": "ERP5",
"instance-root": "/srv/slapgrid/slappart15/srv/runner/instance/",
"urls": [
# TODO
# "https://XXX.host.vifib.net/erp5/",
],
"partitions": [
{
"name": "jerome-dev-mariadb",
"reference": "slappart3",
"type": "erp5/mariadb",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-zodb",
"reference": "slappart4",
"type": "erp5/zeo",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-balancer",
"reference": "slappart6",
"type": "erp5/balancer",
#"static-tags": {
# "XXX": "needed?"
#}
},
{
"name": "jerome-dev-zope-front",
"reference": "slappart5",
"type": "erp5/zope-front",
#"static-tags": {
# "XXX": "needed?"
#}
},
# {
# "name": "jerome-dev-zope-front",
# "reference": "slappart13",
# "type": "erp5/zope-activity",
# #"static-tags": {
# # "XXX": "needed?"
# #}
# }
]
}
],
# TODO: drop this
'promtail-extra-scrape-config':
textwrap.dedent(r'''
- job_name: {cls.__name__}
pipeline_stages:
- match:
selector: '{{job="{cls.__name__}"}}'
stages:
- multiline:
firstline: '^\d{{4}}-\d{{2}}-\d{{2}}\s\d{{1,2}}\:\d{{2}}\:\d{{2}}\,\d{{3}}'
max_wait_time: 3s
- regex:
expression: '^(?P<timestamp>.*) - (?P<name>\S+) - (?P<level>\S+) - (?P<message>.*)'
- timestamp:
format: 2006-01-02T15:04:05Z00:00
source: timestamp
- labels:
level:
name:
static_configs:
- targets:
- localhost
labels:
job: {cls.__name__}
__path__: {cls._logfile.name}
''').format(**locals())
{
"name": "TestLoki",
# "instance-root": "/", # XXX needed ?
"partitions": [
{
# no slapos for system application
"name": "test log file",
"log-file-patterns": cls._logfile.name,
"static-tags": {
"testtag": "foo",
},
},
],
},
],
},
}
return {'_': json.dumps(parameter_dict)}
def xgetInstanceParameterDict(cls):
cls._logfile = tempfile.NamedTemporaryFile(suffix='log')
return {
'promtail-extra-scrape-config':
textwrap.dedent(
r'''
- job_name: {cls.__name__}
pipeline_stages:
- match:
selector: '{{job="{cls.__name__}"}}'
stages:
- multiline:
firstline: '^\d{{4}}-\d{{2}}-\d{{2}}\s\d{{1,2}}\:\d{{2}}\:\d{{2}}\,\d{{3}}'
max_wait_time: 3s
- regex:
expression: '^(?P<timestamp>.*) - (?P<name>\S+) - (?P<level>\S+) - (?P<message>.*)'
- timestamp:
format: 2006-01-02T15:04:05Z00:00
source: timestamp
- labels:
level:
name:
static_configs:
- targets:
- localhost
labels:
job: {cls.__name__}
__path__: {cls._logfile.name}
''').format(**locals())
}
@classmethod
def tearDownClass(cls):
cls._logfile.close()
super(TestLoki, cls).tearDownClass()
def setUp(self):
self.loki_url = json.loads(
self.computer_partition.getConnectionParameterDict()['_']
)['loki-url']
def test_loki_available(self):
self.assertEqual(
requests.codes.ok,
requests.get(f'{self.loki_url}/ready',
verify=False).status_code)
def test_loki_certificate_required(self):
with self.assertRaisesRegex(requests.exceptions.SSLError, 'certificate required'):
requests.get(f'{self.loki_url}/ready', verify=False)
def test_log_ingested(self):
# create a logger logging to the file that we have
......@@ -342,68 +329,45 @@ class TestLoki(GrafanaTestCase):
test_logger.propagate = False
test_logger.setLevel(logging.INFO)
test_handler = logging.FileHandler(filename=self._logfile.name)
test_handler.setFormatter(
logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
test_logger.addHandler(test_handler)
test_logger.info("testing message")
test_logger.info("testing another message")
test_logger.warning("testing warn")
# log an exception, which will be multi-line in the log file.
def nested1():
def nested2():
raise ValueError('boom')
nested2()
try:
nested1()
except ValueError:
test_logger.exception("testing exception")
# Check our messages have been ingested
# we retry a few times, because there's a short delay until messages are
# ingested and returned.
for i in range(60):
resp = requests.get(
'{self.loki_url}/api/prom/query?query={{job="TestLoki"}}'.format(
**locals()),
verify=False).json()
if len(resp.get('streams', [])) < 3:
time.sleep(0.5 * i)
continue
warn_stream_list = [stream for stream in resp['streams'] if 'level="WARNING"' in stream['labels']]
self.assertEqual(1, len(warn_stream_list), resp['streams'])
warn_stream, = warn_stream_list
self.assertIn("testing warn", warn_stream['entries'][0]['line'])
info_stream_list = [stream for stream in resp['streams'] if 'level="INFO"' in stream['labels']]
self.assertEqual(1, len(info_stream_list), resp['streams'])
info_stream, = info_stream_list
self.assertTrue(
[
line for line in info_stream['entries']
if "testing message" in line['line']
])
self.assertTrue(
[
line for line in info_stream['entries']
if "testing another message" in line['line']
])
error_stream_list = [stream for stream in resp['streams'] if 'level="ERROR"' in stream['labels']]
self.assertEqual(1, len(error_stream_list), resp['streams'])
error_stream, = error_stream_list
line, = [line['line'] for line in error_stream['entries']]
# this entry is multi-line
self.assertIn('testing exception\nTraceback (most recent call last):\n', line)
self.assertIn('ValueError: boom', line)
# The labels we have configured are also available
resp = requests.get(
'{self.loki_url}/api/prom/label'.format(**locals()),
verify=False).json()
self.assertIn('level', resp['values'])
self.assertIn('name', resp['values'])
test_logger.info("testing info message")
partition_root = pathlib.Path(self.computer_partition_root_path)
get = functools.partial(
requests.get,
cert=(
partition_root / 'etc' / 'loki-promise-client-certificate.crt',
partition_root / 'etc' / 'loki-promise-client-certificate.key',
),
verify=partition_root / 'etc' / 'loki-server-certificate.ca.crt',
)
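# query Loki's HTTP range-query endpoint; the LogQL selector
# {app="TestLoki"} with the empty line filter |= "" matches every line
# carrying that app label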
url = urllib.parse.urlparse(
self.loki_url
)._replace(
path="/loki/api/v1/query_range",
query=urllib.parse.urlencode({'query': '{app="TestLoki"} |= ""'}),
).geturl()
result = []  # stays empty if Loki never returns data within the retries
for i in range(16):
resp = get(url)
if resp.ok:
if result := resp.json().get('data', {}).get('result', []):
break
time.sleep(i)
self.assertEqual(
result[0]['stream'],
{
'app': 'TestLoki',
'detected_level': 'info',
'filename': self._logfile.name,
'job': 'test log file',
'partition': 'test log file',
'service_name': 'TestLoki',
'testtag': 'foo',
}
)
self.assertEqual(
[v[1] for v in result[0]['values']],
['testing info message'])
self.assertEqual(len(result), 1)
class TestListenInPartition(GrafanaTestCase):
......@@ -411,9 +375,18 @@ class TestListenInPartition(GrafanaTestCase):
with self.slap.instance_supervisor_rpc as supervisor:
all_process_info = supervisor.getAllProcessInfo()
def canonical_process_name(process):
"""remove hash from hash-files and "on-watch"
"""
return re.sub(
r'-([a-f0-9]{32})$',
'',
process['name'].replace('-on-watch', ''),
)
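# e.g. hypothetical names "loki-service-on-watch" and
# "promtail-0123456789abcdef0123456789abcdef" both reduce to their base
# program name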
self.process_dict = {
canonical_process_name(p): psutil.Process(p['pid'])
for p in all_process_info if p['name'] != 'watchdog'
}
def test_grafana_listen(self):
......@@ -449,11 +422,11 @@ class TestListenInPartition(GrafanaTestCase):
def test_loki_listen(self):
self.assertEqual(
sorted([
c.laddr for c in self.process_dict['loki-service'].connections()
if c.status == 'LISTEN'
]),
[
(self.computer_partition_ipv6_address, 3100),
(self._ipv4_address, 9095),
],
)
......
......@@ -3,12 +3,16 @@ import argparse
import json
import os.path
import urllib
from urlparse import urlparse, urlunparse, ParseResult
from urllib.parse import urlparse, urlunparse, ParseResult
import jsonschema
# Adapted from slapos.core.git/slapos/slap/util.py
from lxml import etree
def xml2dict(infile):
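# despite the historical name, the parameter file is now JSON; "$schema"
# is schema metadata, not an instance parameter, so it is dropped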
d = json.load(infile)
d.pop('$schema', None)
return d
from lxml import etree
result_dict = {}
for element in etree.parse(infile).iter(tag=etree.Element):
if element.tag == 'parameter':
......