WIP grafana

6ec46ee4 · Jérome Perrin · 4c4f04ce · 6ec46ee4 · 6ec46ee4 · 6ec46ee4
Commit 6ec46ee4 authored Sep 27, 2023 by Jérome Perrin
6 changed files
--- a/software/grafana/buildout.hash.cfg
+++ b/software/grafana/buildout.hash.cfg
@@ -15,7 +15,7 @@

 [instance-profile]
 filename = instance.cfg.in
-md5sum = 39a1ee09ca7a12995703ff2a6a869637
+md5sum = bdc556fa76a3f5a763391797c7f3e342

 [influxdb-config-file]
 filename = influxdb-config-file.cfg.in
@@ -23,7 +23,7 @@ md5sum = a28972ced3e0f4aa776e43a9c44717c0

 [telegraf-config-file]
 filename = telegraf-config-file.cfg.in
-md5sum = 6de1faa34842e1eda095a51edecc2083
+md5sum = 016d0163ca3dbabe538a8feeee745c60

 [grafana-config-file]
 filename = grafana-config-file.cfg.in
@@ -39,8 +39,4 @@ md5sum = 5616679a9c5c2757540175ead3f5500a

 [loki-config-file]
 filename = loki-config-file.cfg.in
-md5sum = 19a7f5cb904b3287b0bc7cb3e8a27429
-
-[loki-nginx-config-file]
-filename = loki-nginx-config-file.cfg.in
-md5sum = b08ce1e4abb34eb79e26133459c27c3a
+md5sum = e4917bae2a07598ce5305520e9a58b6d
--- a/software/grafana/instance.cfg.in
+++ b/software/grafana/instance.cfg.in
+{% import "caucase" as caucase with context %}
+
 [buildout]
 parts =
  promises
@@ -45,17 +47,13 @@ grafana-dashboards-dir = ${:grafana-dir}/dashboards
 telegraf-dir = ${:srv}/telegraf
 telegraf-extra-config-dir = ${:telegraf-dir}/extra-config
 loki-dir = ${:srv}/loki
-loki-boltdb-shipper-active-index-directory = ${:loki-dir}/index
-loki-boltdb-shipper-cache-location = ${:loki-dir}/index-cache
-loki-compactor-working-directory = ${:loki-dir}/compactor
 loki-storage-filesystem-directory = ${:loki-dir}/chunks
-loki-nginx-dir = ${:srv}/loki-nginx
-loki-nginx-logs-dir = ${:loki-nginx-dir}/logs
-
+caucase-updater-loki = ${:srv}/caucase-updater/loki/
 promtail-dir = ${:srv}/promtail

 # macros
-[generate-certificate]
+[generate-insecure-self-signed-certificate]
+# TODO: stop using this, use caucase
 recipe = plone.recipe.command
 command =
  if [ ! -e ${:key-file} ]
@@ -117,7 +115,7 @@ recipe = slapos.cookbook:generate.password
 username = influxdb

 [influxdb-certificate]
-<= generate-certificate
+<= generate-insecure-self-signed-certificate

 [influxdb-listen-promise]
 <= check-port-listening-promise
@@ -163,17 +161,15 @@ ssl-cert-file = ${grafana-certificate:cert-file}

 recipe = slapos.cookbook:wrapper
 command-line =
-    {{ grafana_bin }} -config ${grafana-config-file:output} -homepath {{ grafana_homepath }}
+    {{ grafana_bin }} server -config ${grafana-config-file:output} -homepath {{ grafana_homepath }}
 wrapper-path = ${directory:service}/grafana

 [grafana-certificate]
-<= generate-certificate
+<= generate-insecure-self-signed-certificate

 [grafana-password]
-# TODO
-#recipe = slapos.cookbook:generate.password
+recipe = slapos.cookbook:generate.password
 username = admin
-passwd = admin

 [grafana-secret-key]
 recipe = slapos.cookbook:generate.password
@@ -220,6 +216,7 @@ context =
  section influxdb influxdb
  section telegraf telegraf
  section extra telegraf-config-file-extra
+  section slap_configuration slap-configuration

 [telegraf-config-file-extra]
 recipe = slapos.recipe.build
@@ -267,7 +264,12 @@ init =
            "name_override": f"{partition['name']}-mysql",
            "servers": [dsn],
            "gather_innodb_metrics": True,
-            "tags": dict(partition.get("static-tags", {}), app=application["name"]),
+            "tags": dict(
+              partition.get("static-tags", {}),
+              app=application["name"],
+              name=partition["name"],
+              partition=partition["reference"],
+            ),
          }
        )
        if partition["type"] == "erp5/mariadb":
@@ -278,39 +280,44 @@ init =
              "dsn": dsn,
              "query": [
                {
-                  "query": "select count(*) as message_count from message",
+                  "query": """
+                    select 'message' as cmf_activity_queue, count(*) as message_count from message
+                    union all select 'message_queue' as cmf_activity_queue, count(*) as message_count from message_queue
+                  """,
                  "field_columns_include": ["message_count"],
-                },
-                {
-                  "query": "select count(*) as message_queue_count from message_queue",
-                  "field_columns_include": ["message_queue_count"],
-                },
-                {
-                  "query": "select count(*) as message_failed_count from message where processing_node=-2",
-                  "field_columns_include": ["message_failed_count"],
-                },
-                {
-                  "query": "select count(*) as message_queue_failed_count from message_queue where processing_node=-2",
-                  "field_columns_include": ["message_queue_failed_count"],
+                  "tag_columns_include": ["cmf_activity_queue"],
                },
                {
                  "query": """
-                    select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
-                      as message_waiting_time from message
-                      where processing_node in (-1, 0) and message not like '%after_tag%'
+                    select 'message' as cmf_activity_queue, count(*) as failed_message_count
+                      from message where processing_node between -2 and -10
+                    union all select 'message_queue' as cmf_activity_queue, count(*) as failed_message_count
+                      from message_queue where processing_node between -2 and -10
                  """,
-                  "field_columns_include": ["message_waiting_time"],
+                  "field_columns_include": ["failed_message_count"],
+                  "tag_columns_include": ["cmf_activity_queue"],
                },
                {
                  "query": """
+                    select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message.date)), 0) as int)
+                      as waiting_time, 'message' as cmf_activity_queue
+                      from message where processing_node in (-1, 0) and message.message not like '%after_tag%'
+                    union all
                    select cast(coalesce(max(UNIX_TIMESTAMP(now()) - UNIX_TIMESTAMP(message_queue.date)), 0) as int)
-                      as message_queue_waiting_time from message_queue
-                      where processing_node in (-1, 0) and message not like '%after_tag%'
+                      as waiting_time, 'message_queue' as cmf_activity_queue
+                      from message_queue where processing_node in (-1, 0) and message_queue.message not like '%after_tag%'
                  """,
-                  "field_columns_include": ["message_queue_waiting_time"],
-                }
+                  "field_columns_include": ["waiting_time"],
+                  "tag_columns_include": ["cmf_activity_queue"],
+
+                },
              ],
-              "tags": dict(partition.get("static-tags", {}), app=application["name"]),
+              "tags": dict(
+                partition.get("static-tags", {}),
+                app=application["name"],
+                name=partition["name"],
+                partition=partition["reference"],
+              ),
            }
          )

@@ -326,7 +333,12 @@ init =
            ],
            "grok_timezone": "Local",
            "name_override": f"{partition['name']}",
-            "tags": dict(partition.get("static-tags", {}), app=application["name"]),
+            "tags": dict(
+              partition.get("static-tags", {}),
+              app=application["name"],
+              name=partition["name"],
+              partition=partition["reference"],
+            ),
          }
        )
    urls = application.get("urls", [])
@@ -344,12 +356,13 @@ init =
        # x509_cert wants a port
        if not parsed_url.port:
          x509_url = parsed_url._replace(netloc=parsed_url.hostname+':443').geturl()
-      inputs["x509_cert"].append({
-        "sources": [x509_url],
-        "tags": {"url": url},
-        "interval": "5h",
-        "tags": {"app": application["name"]},
-      })
+        inputs["x509_cert"].append({
+          "sources": [x509_url],
+          "interval": "5h",
+          # A dict literal keeps only the last occurrence of a duplicated key,
+          # so both tags have to be declared in one single "tags" mapping.
+          "tags": {"url": url, "app": application["name"]},
+        })
+      # TODO some kind of GET request every 5 minutes ?

    if application.get("type") == "SlapOS":
      telegraf_slapos_input_config_file = os.path.join(
@@ -360,8 +373,6 @@ init =
          "slapos": [{
            "instance_root": application["instance-root"]}]}})

-      # TODO: supervisor process finder for
-      # https://github.com/influxdata/telegraf/tree/master/plugins/inputs/procstat ?
      telegraf_slapos_input_command = self.options['telegraf-input-slapos-bin']
      inputs["execd"].append({
        "name_override": f"{application['name']}-processes",
@@ -389,20 +400,20 @@ init =
      processors["enum"].append({
        "namepass": [ f"{application['name']}-processes"],
        "mapping": [{
-  #        "tag": "group", # TODO: rename this in input plugin # XXX I don't remember what this means
-          "tag": "slappart",
-          "dest": "partition",
+          "tag": "reference",
+          "dest": "name",
          "value_mappings": partition_mapping,
        }]})

  # TODOs:
+  #  - [ ] use tags partition-id and partition-reference with consistency
  #  - [ ] slapos input
  #    - [x] friendly name of slappart
  #    - [x] strip hashes from -on-watch
+  #    - [ ] process name is incorrect for zope
  #  - [x] activity metrics
-  #  - [ ] alert dashboard
-  #  - [ ] inclu "jerome-dev" partout ???
-  #  - [ ] apdex
+  #  - [?] alert dashboard
+  #  - [?] apdex
  #  - [ ] "job" is bad name in Explore

  options["extra-config"] = toml.dumps({
@@ -410,6 +421,10 @@ init =
    "processors": processors})
  # import pdb; pdb.set_trace()

+# tips:
+#  LogQL to graph total slow query time
+#   sum(rate({partition="mariadb"} | regexp `(Query_time:\s(?P<query_time>\d+)\.)` | unwrap query_time [$__interval]))
+
 # apdex
 # SELECT sum("success") / sum("all") FROM
 #     (SELECT count("duration") AS "all" FROM "jerome-dev-balancer" WHERE $timeFilter GROUP BY time($__interval) fill(null)),
@@ -428,179 +443,131 @@ install =
      f.write(content)


-[loki]
-boltdb-shipper-active-index-directory = ${directory:loki-boltdb-shipper-active-index-directory}
-boltdb-shipper-cache-location = ${directory:loki-boltdb-shipper-cache-location}
-compactor-working-directory = ${directory:loki-compactor-working-directory}
+[loki-server]
 storage-filesystem-directory = ${directory:loki-storage-filesystem-directory}
+path-prefix = ${directory:loki-dir}

-ip = ${instance-parameter:ipv4-random}
-read-1-http-port = 3101
-read-1-grpc-port = 9096
-read-1-memberlist-port = 7947
-read-2-http-port = 3102
-read-2-grpc-port = 9097
-read-2-memberlist-port = 7948
-write-http-port = 3103
-write-grpc-port = 9098
-write-memberlist-port = 7949
-query-frontend-http-port = 3104
-query-frontend-grpc-port = 9099
-query-frontend-memberlist-port = 7950
-
-querier-http-port = 3105
-querier-grpc-port = 9100
-querier-memberlist-port = 7951
-
-index-gateway-http-port = 3106
-index-gateway-grpc-port = 9101
-index-gateway-memberlist-port = 7952
-
-query-scheduler-http-port = 3107
-query-scheduler-grpc-port = 9102
-query-scheduler-memberlist-port = 7953
-
-# compactor
-
-nginx-port = 3100
-url = http://${:ip}:${:nginx-port}
-
+http-port = 3100
+# The option holding the address is "ipv6" (there is no "ip6"), and an IPv6
+# literal must be enclosed in brackets to be a valid URL host.
+url = https://[${:ipv6}]:${:http-port}
+ipv4 = ${instance-parameter:ipv4-random}
 ipv6 = ${instance-parameter:ipv6-random}
+ca-file = ${loki-server-certificate:ca-file}
+cert-file = ${loki-server-certificate:cert-file}
+key-file = ${loki-server-certificate:key-file}
+# TODO: CRL

-
-[loki-service-macro]
+[loki-service]
 recipe = slapos.cookbook:wrapper
 command-line =
   bash -c 'nice -19 chrt --idle 0 ionice -c3 {{ loki_bin }} \
-      -config.file=${loki-config-file:output} \
-      \
-      -boltdb.shipper.compactor.ring.instance-addr=${loki:ip} \
-      -boltdb.shipper.compactor.ring.instance-id=${:_buildout_section_name_} \
-      -common.embedded-cachering.instance-addr=${loki:ip} \
-      -common.embedded-cachering.instance-id=${:_buildout_section_name_} \
-      -distributor.ring.instance-addr=${loki:ip} \
-      -distributor.ring.instance-id=${:_buildout_section_name_} \
-      -frontend.instance-addr=${loki:ip} \
-      -frontend.instance-port=${loki:query-frontend-grpc-port} \
-      -index-gateway.ring.instance-addr=${loki:ip} \
-      -index-gateway.ring.instance-id=${:_buildout_section_name_} \
-      -memberlist.advertise-port=${:memberlist-port} \
-      -memberlist.bind-port=${:memberlist-port} \
-      -memberlist.nodename=${:_buildout_section_name_} \
-      -query-scheduler.ring.instance-addr=${loki:ip} \
-      -query-scheduler.ring.instance-id=${:_buildout_section_name_} \
-      -ruler.ring.instance-addr=${loki:ip} \
-      -ruler.ring.instance-id=${:_buildout_section_name_} \
-      -server.grpc-listen-port=${:grpc-port} \
-      -server.http-listen-port=${:http-port} \
-      ${:extra-command-line}'
-wrapper-path = ${directory:service}/${:_buildout_section_name_}
-extra-command-line =
+      -config.file=${loki-server-config-file:output}'

+wrapper-path = ${directory:service}/${:_buildout_section_name_}

-[loki-listen-promise-macro]
-<= check-url-available-promise
-url = http://${loki:ip}:${:port}/ready
-
-[loki-read-1-service]
-<= loki-service-macro
-extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-2-grpc-port} -query-scheduler.ring.instance-port=${loki:read-1-grpc-port}
-http-port = ${loki:read-1-http-port}
-grpc-port = ${loki:read-1-grpc-port}
-memberlist-port = ${loki:read-1-memberlist-port}
-
-[loki-read-1-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-read-1-service:http-port}
-
-[loki-read-2-service]
-<= loki-service-macro
-extra-command-line = -target=read -querier.scheduler-address=${loki:ip}:${loki:read-1-grpc-port} -query-scheduler.ring.instance-port=${loki:read-2-grpc-port}
-http-port = ${loki:read-2-http-port}
-grpc-port = ${loki:read-2-grpc-port}
-memberlist-port = ${loki:read-2-memberlist-port}
-
-[loki-read-2-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-read-2-service:http-port}
-
-[loki-write-service]
-<= loki-service-macro
-extra-command-line = -target=write
-http-port = ${loki:write-http-port}
-grpc-port = ${loki:write-grpc-port}
-memberlist-port = ${loki:write-memberlist-port}
-
-[loki-write-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-write-service:http-port}
-
-[loki-querier-service]
-<= loki-service-macro
-extra-command-line = -target=querier -querier.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port} -query-scheduler.ring.instance-port=${loki:querier-grpc-port}
-http-port = ${loki:querier-http-port}
-grpc-port = ${loki:querier-grpc-port}
-memberlist-port = ${loki:querier-memberlist-port}
-
-[loki-querier-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-querier-service:http-port}
-
-[loki-index-gateway-service]
-<= loki-service-macro
-extra-command-line = -target=index-gateway -boltdb.shipper.query-ready-num-days=30
-# XXX -boltdb.shipper.query-ready-num-days=30 useful ?
-http-port = ${loki:index-gateway-http-port}
-grpc-port = ${loki:index-gateway-grpc-port}
-memberlist-port = ${loki:index-gateway-memberlist-port}
-
-[loki-index-gateway-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-index-gateway-service:http-port}
-
-[loki-query-frontend-service]
-<= loki-service-macro
-extra-command-line = -target=query-frontend -frontend.scheduler-address=${loki:ip}:${loki:query-scheduler-grpc-port}
-http-port = ${loki:query-frontend-http-port}
-grpc-port = ${loki:query-frontend-grpc-port}
-memberlist-port = ${loki:query-frontend-memberlist-port}
-
-[loki-query-frontend-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-query-frontend-service:http-port}
-
-[loki-query-scheduler-service]
-<= loki-service-macro
-extra-command-line = -target=query-scheduler
-http-port = ${loki:query-scheduler-http-port}
-grpc-port = ${loki:query-scheduler-grpc-port}
-memberlist-port = ${loki:query-scheduler-memberlist-port}
-
-[loki-query-scheduler-listen-promise]
-<= loki-listen-promise-macro
-port = ${loki-query-scheduler-service:http-port}
-
-
-[loki-config-file]
+[loki-server-config-file]
 <= config-file
 context =
-  section loki loki
+  section loki loki-server

-[loki-nginx-service]
-recipe = slapos.cookbook:wrapper
-command-line =
-   {{ nginx_bin }} -p ${directory:loki-nginx-dir} -c ${loki-nginx-config-file:output}
-wrapper-path = ${directory:service}/${:_buildout_section_name_}
-url = http://${loki:ip}:${loki:nginx-port}
+[loki-server-certificate-init-certificate]
+recipe = slapos.recipe.build
+init =
+  # pre-create a file at the path of the certificate,
+  # so that we can use hash-existing-files options
+  import pathlib
+  cert_file = pathlib.Path(self.buildout['loki-server-certificate']['cert-file'])
+  if not cert_file.parent.exists():
+    cert_file.parent.mkdir()
+  if not cert_file.exists():
+    cert_file.touch()
+
+[loki-server-certificate]
+init = ${loki-server-certificate-init-certificate:init}
+key-file = ${directory:etc}/${:_buildout_section_name_}.key
+cert-file = ${directory:etc}/${:_buildout_section_name_}.crt
+common-name = ${:_buildout_section_name_}
+ca-file = ${directory:etc}/${:_buildout_section_name_}.ca.crt
+crl-file = ${directory:etc}/${:_buildout_section_name_}.crl
+
+{{
+caucase.updater(
+    prefix='loki-server-certificate',
+    buildout_bin_directory=buildout['bin-directory'],
+    updater_path='${directory:service}/loki-server-certificate-updater',
+    url='${loki-caucased:url}',
+    data_dir='${directory:caucase-updater-loki}',
+    crt_path='${loki-server-certificate:cert-file}',
+    ca_path='${loki-server-certificate:ca-file}',
+    crl_path='${loki-server-certificate:crl-file}',
+    key_path='${loki-server-certificate:key-file}',
+    template_csr='${loki-server-certificate-prepare-csr:csr}',
+    openssl=openssl_bin,
+)}}
+
+[loki-server-certificate-csr-config]
+recipe = slapos.recipe.template
+inline =
+  [req]
+  prompt = no
+  req_extensions = req_ext
+  distinguished_name = dn
+  [ dn ]
+  CN = loki-server
+  [ req_ext ]
+  subjectAltName = @alt_names
+  [ alt_names ]
+  IP.1 = ${loki-server:ipv4}
+  IP.2 = ${loki-server:ipv6}
+output = ${buildout:parts-directory}/${:_buildout_section_name_}/${:_buildout_section_name_}
+
+[loki-server-certificate-prepare-csr]
+recipe = plone.recipe.command
+command =
+  if [ ! -f '${:csr}' ] ; then
+   {{ openssl_bin }} req \
+      -newkey rsa \
+      -batch \
+      -new \
+      -sha256 \
+      -nodes \
+      -keyout /dev/null \
+      -config '${loki-server-certificate-csr-config:output}' \
+      -out '${:csr}'
+  fi
+stop-on-error = true
+csr = ${directory:srv}/${:_buildout_section_name_}.csr.pem

-[loki-nginx-listen-promise]
+[loki-server-listen-promise]
 <= check-url-available-promise
-url = ${loki-nginx-service:url}
+# IPv6 literals must be bracketed in the URL host.
+url = https://[${loki-server:ipv6}]:${loki-server:http-port}/ready
+ca-cert-file = ${loki-server:ca-file}
+# NOTE(review): these options previously referenced themselves (${:cert-file}),
+# which cannot resolve. The server certificate is reused here as the client
+# certificate; confirm it is accepted for client authentication or switch to
+# a dedicated client certificate.
+cert-file = ${loki-server:cert-file}
+key-file = ${loki-server:key-file}
+
+
+
+[loki-caucased]
+port = 18080
+ip = ${instance-parameter:ipv6-random}
+netloc = [${:ip}]:${:port}
+url = http://${:netloc}/
+
+{# service_auto_approve_count covers: server: loki clients: grafana, promtail #}
+{{
+caucase.caucased(
+    prefix='loki-caucased',
+    buildout_bin_directory=buildout['bin-directory'],
+    caucased_path='${directory:service}/caucased',
+    backup_dir='${directory:backup-caucased}',
+    data_dir='${directory:srv}/caucased',
+    netloc='${loki-caucased:netloc}',
+    tmp='${directory:tmp}',
+    service_auto_approve_count=5,
+    user_auto_approve_count=1,
+    key_len=2048,
+)}}
+

-[loki-nginx-config-file]
-<= config-file
-context =
-  section loki loki

 [promtail]
 recipe = slapos.cookbook:wrapper
@@ -790,9 +757,13 @@ install =
                  "stages": [
                    {
                      "multiline": {
-                        # TODO
-                        #"firstline": "^# Time: \\d{2}\\d{2}\\d{2}\\s\\d{1,2}\\:\\d{2}\\:\\d{2}",
-                        "firstline": r"^# Time: \d{2}.*",
+                        # between each slow query, slow query log has a first line like:
+                        #   # Time: 231008 16:29:01
+                        # and then a second like:
+                        #   # User@Host: user[user] @  [10.0.71.207]
+                        # but the first line is not repeated for subsequent queries that happen
+                        # within the same second
+                        "firstline": r"(^# Time: \d{2}.*\n^# User@Host:.*|^# User@Host:.*)",
                        "max_wait_time": "3s"
                      }
                    },
@@ -947,14 +918,7 @@ instance-promises =
  ${influxdb-password-promise:wrapper-path}
  ${influxdb-database-ready-promise:wrapper-path}
  ${grafana-listen-promise:path}
-  ${loki-query-frontend-listen-promise:path}
-  ${loki-query-scheduler-listen-promise:path}
-#  ${loki-index-gateway-listen-promise:path}
-  ${loki-querier-listen-promise:path}
-#  ${loki-read-1-listen-promise:path}
-#  ${loki-read-2-listen-promise:path}
-  ${loki-write-listen-promise:path}
-  ${loki-nginx-listen-promise:path}
+  ${loki-server-listen-promise:path}
  ${promtail-listen-promise:path}
  ${apache-frontend-available-promise:path}

@@ -969,6 +933,6 @@ telegraf-extra-config-dir = ${telegraf:extra-config-dir}
 grafana-url = ${grafana:url}
 grafana-username = ${grafana:admin-user}
 grafana-password = ${grafana:admin-password}
-loki-url = ${loki:url}
+loki-internal-url = ${loki-server:url}
 promtail-url = ${promtail:url}
 url = ${apache-frontend:connection-secure_access}
--- a/software/grafana/loki-config-file.cfg.in
+++ b/software/grafana/loki-config-file.cfg.in
-# insipired from
-# https://github.com/grafana/loki/blob/1489c1731277c327e3661da182bfc6c90d4559f4/tools/dev/loki-boltdb-storage-s3/docker-compose.yml
-# and othe configuration examples with microservices, because the single binary
-# mode assumes running on 127.0.0.1, but in slapos we want to bind on partition's
-# addresses
-
 auth_enabled: false

-http_prefix:
-
 server:
-  http_listen_address: {{ loki['ip'] }}
-  grpc_listen_address: {{ loki['ip'] }}
+  http_listen_address: {{ loki['ipv6'] }}
+  http_listen_port: {{ loki['http-port'] }}
+  http_tls_ca_path: {{ loki['ca-file'] }}
+  http_tls_cert_path: {{ loki['cert-file'] }}
+  http_tls_key_path: {{ loki['key-file'] }}
+  http_tls_client_auth_type: RequireAndVerifyClientCert
+
+  grpc_listen_address: {{ loki['ipv4'] }}
  grpc_server_max_recv_msg_size: 1.048576e+08
  grpc_server_max_send_msg_size: 1.048576e+08

-# # TODO ?
-# wal:
-#   enabled: true
-#   dir: /loki/wal

 common:
-  compactor_address: http://{{ loki['ip'] }}:{{ loki['write-http-port'] }}
+  ring:
+    instance_addr: {{ loki['ipv4'] }}
+    kvstore:
+      store: inmemory
+  replication_factor: 1
+  path_prefix: {{ loki['path-prefix'] }}

 schema_config:
  configs:
  - from: 2020-05-15
-    store: boltdb-shipper
+    store: tsdb
    object_store: filesystem
-    schema: v11
+    schema: v13
    index:
      prefix: index_
      period: 24h

 storage_config:
-  boltdb_shipper:
-    active_index_directory: {{ loki['boltdb-shipper-active-index-directory'] }}
-    cache_location: {{ loki['boltdb-shipper-cache-location'] }}
  filesystem:
    directory: {{ loki['storage-filesystem-directory'] }}

@@ -45,42 +41,8 @@ limits_config:
  ingestion_rate_mb: 1024
  ingestion_burst_size_mb: 1024

-
-ingester:
-  lifecycler:
-    address: {{ loki['ip'] }}
-    ring:
-      kvstore:
-        store: memberlist
-      replication_factor: 1
-
-compactor:
-    compaction_interval: 1m
-    retention_enabled: true
-    working_directory: {{ loki['compactor-working-directory'] }}
-
-frontend:
-  log_queries_longer_than: 5s
-  compress_responses: true
-  max_outstanding_per_tenant: 2048
-  tail_proxy_url: http://{{ loki['ip'] }}:{{ loki['querier-http-port']}}
-
+# https://github.com/grafana/loki/issues/5143#issuecomment-1697196679
 frontend_worker:
-  scheduler_address: {{ loki['ip'] }}:{{ loki['query-scheduler-grpc-port'] }}
-#testERP5Type
-memberlist:
-  bind_addr:
-    - {{ loki['ip'] }}
-
-  join_members:
-#  - {{ loki['ip'] }}:{{ loki['read-1-memberlist-port'] }}
-  - {{ loki['ip'] }}:{{ loki['querier-memberlist-port'] }}
-#  - {{ loki['ip'] }}:{{ loki['write-memberlist-port'] }}
-
-
-query_scheduler:
-  max_outstanding_requests_per_tenant: 1024
-
-querier:
-  query_ingesters_within: 2h
+   grpc_client_config:
+     max_send_msg_size: 268435456

--- a/software/grafana/loki-nginx-config-file.cfg.in
+++ b/software/grafana/loki-nginx-config-file.cfg.in
-daemon off;
-
-events {
-    worker_connections 1024;
-}
-
-error_log   /dev/stdout;
-
-http {
-  default_type application/octet-stream;
-  access_log   /dev/stdout;
-  sendfile     on;
-  tcp_nopush   on;
-
-  upstream read {
-    server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
-  }
-
-  upstream write {
-    server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
-  }
-
-  upstream cluster {
-    server {{ loki['ip'] }}:{{ loki['write-http-port'] }};
-    server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
-    server {{ loki['ip'] }}:{{ loki['querier-http-port'] }};
-
-  }
-
-  upstream query-frontend {
-    server {{ loki['ip'] }}:{{ loki['query-frontend-http-port'] }};
-  }
-
-  server {
-    listen {{ loki['ip'] }}:{{ loki['nginx-port'] }};
-# XXX while debugging
-    listen [{{ loki['ipv6'] }}]:{{ loki['nginx-port'] }};
-
-    location / {
-        return 200 'OK';
-    }
-
-    location = /ring {
-        proxy_pass       http://cluster$request_uri;
-    }
-
-    location = /memberlist {
-        proxy_pass       http://cluster$request_uri;
-    }
-
-    location = /config {
-        proxy_pass       http://cluster$request_uri;
-    }
-
-    location = /metrics {
-        proxy_pass       http://cluster$request_uri;
-    }
-
-    location = /ready {
-        proxy_pass       http://cluster$request_uri;
-    }
-
-    location = /loki/api/v1/push {
-        proxy_pass       http://write$request_uri;
-    }
-
-    location = /loki/api/v1/tail {
-        proxy_pass       http://read$request_uri;
-        proxy_set_header Upgrade $http_upgrade;
-        proxy_set_header Connection "upgrade";
-    }
-
-    location ~ /loki/api/.* {
-        proxy_pass       http://query-frontend$request_uri;
-    }
-  }
-}
--- a/software/grafana/software.cfg
+++ b/software/grafana/software.cfg
 [buildout]
 extends =
  ../../stack/slapos.cfg
+  ../../stack/caucase/buildout.cfg
  ../../stack/nodejs.cfg
  ../../component/make/buildout.cfg
  ../../component/golang/buildout.cfg
  ../../component/openssl/buildout.cfg
  ../../component/curl/buildout.cfg
  ../../component/dash/buildout.cfg
-  ../../component/nginx/buildout.cfg
+  ../../component/systemd/buildout.cfg
  buildout.hash.cfg

 parts =
@@ -20,16 +21,7 @@ parts =
  grafana-provisioning-datasources-config-file
  grafana-provisioning-dashboards-config-file
  loki-config-file
-  loki-nginx-config-file

-; [nodejs]
-; <= nodejs-16.19.0
-
-
-[gowork]
-# XXX speed up development cycle by not rebuilding workspace on every software run
-# XXX does not work ?
-update-command =

 [go_github.com_grafana_grafana]
 <= go-git-package
@@ -41,7 +33,7 @@ revision      = v10.1.2-0-g8e428858dd
 <= go-git-package
 go.importpath = github.com/grafana/loki
 repository    = https://github.com/grafana/loki
-revision      = v2.9.1-0-gd9d5ed4a1
+revision      = v3.0.0-0-gb4f7181c7

 [go_github.com_influxdata_influxdb]
 <= go-git-package
@@ -59,7 +51,7 @@ revision      = v1.28.1-0-g3ea9ffbe2
 <= go-git-package
 go.importpath = github.com/perrinjerome/telegraf-input-slapos
 repository    = https://github.com/perrinjerome/telegraf-input-slapos
-revision      = v0.0.1-0-gf8981f3
+revision      = v0.0.2-0-gd4c5221

 [go_github.com_prometheus_prometheus]
 <= go-git-package
@@ -84,15 +76,18 @@ install =
  ${go_github.com_perrinjerome_slapos_telegraf_input:location}:./...
  ${go_github.com_prometheus_prometheus:location}:./cmd/...

-# disable cgo, to prevent loki/promtail from using go-systemd
 environment =
-  CGO_ENABLED = 0
-
+  CGO_ENABLED=1
+  CGO_CFLAGS=-I${systemd:location}/include
+buildflags =
+  -tags promtail_journal_enabled
+cpkgpath =
+  ${systemd:location}
 telegraf-bin = ${:bin}/telegraf
 telegraf-input-slapos-bin = ${:bin}/telegraf-input-slapos
 influx-bin = ${:bin}/influx
 influxd-bin = ${:bin}/influxd
-grafana-bin = ${:bin}/grafana-server
+grafana-bin = ${grafana:binpath}/grafana
 grafana-homepath = ${grafana:homepath}
 loki-bin = ${:bin}/loki
 promtail-bin = ${:bin}/promtail
@@ -105,8 +100,6 @@ command = bash -ce "
  . ${gowork:env.sh} && \
  go install github.com/google/wire/cmd/wire@v0.5.0 && \
  wire gen -tags oss ./pkg/server ./pkg/cmd/grafana-cli/runner && \
-  # Unlike loki, grafana _needs_ CGO_ENABLED, so we override here
-  export CGO_ENABLED=1 && \
  go run build.go setup && \
  go run build.go build && \
  export NODE_OPTIONS=--max_old_space_size=8192 && \
@@ -119,6 +112,8 @@ command = bash -ce "
  rm -rf ${buildout:directory}/.cache/yarn/
  "
 homepath = ${go_github.com_grafana_grafana:location}
+# XXX "linux-amd64" is not portable here
+binpath = ${go_github.com_grafana_grafana:location}/bin/linux-amd64
 stop-on-error = true

 [download-file-base]
@@ -143,9 +138,6 @@ url = ${:_profile_base_location_}/${:filename}
 [loki-config-file]
 <= download-file-base

-[loki-nginx-config-file]
-<= download-file-base
-
 [instance-eggs]
 recipe = zc.recipe.egg
 eggs =
@@ -167,13 +159,14 @@ context =
  key grafana_bin gowork:grafana-bin
  key grafana_homepath gowork:grafana-homepath
  key loki_bin gowork:loki-bin
-  raw nginx_bin ${nginx:location}/sbin/nginx
  key promtail_bin gowork:promtail-bin
  key curl_bin :curl-bin
  key dash_bin :dash-bin
 curl-bin = ${curl:location}/bin/curl
 dash-bin = ${dash:location}/bin/dash
 depends = ${instance-eggs:eggs}
+import-list =
+  file caucase caucase-jinja2-library:target

 [versions]
 inotifyx = 0.2.2

--- a/software/grafana/telegraf-config-file.cfg.in
+++ b/software/grafana/telegraf-config-file.cfg.in
@@ -21,7 +21,7 @@

 # Tags can also be specified via a normal map, but only one form at a time:
 [tags]
-  # dc = "us-east-1"
+  computer_id = "{{ slap_configuration['computer'] }}"

 # Configuration for telegraf agent
 [agent]
@@ -42,8 +42,7 @@
  # Run telegraf in debug mode
  debug = false
  # Override default hostname, if empty use os.Hostname()
-  hostname = ""
-
+  hostname = ""

 ###############################################################################
 #                                  OUTPUTS                                    #
@@ -97,6 +96,7 @@

 [system]

+# TODO: generate this full config file in toml
 {{ extra['extra-config'] }}

 ###############################################################################