feat(monitoring): integrate metrics inside the core (#816)
Changes:
- removed the node-monitor service
- node_exporter is now a core module, installed on all nodes; virtual filesystems are not monitored, to avoid false alerts on partition free space
- on update, install the new metrics module and remove existing node_exporter instances to avoid conflicts on port 9100
- the metrics module is installed and running only on the leader node
- switch-leader makes sure there is only one instance of the metrics module inside the cluster (see the spot-check sketched below)
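
For reviewers, a quick spot-check of the intended end state on a running cluster — a sketch, not part of the commit; it assumes shell access on the leader node, that redis-cli can reach and is authorized against the cluster Redis DB, and that metrics module IDs look like metrics1:

    # Every node: node_exporter answers on the host network, port 9100
    curl -s http://127.0.0.1:9100/metrics | head -n 5

    # Whole cluster: exactly one metrics module should be registered
    redis-cli HGETALL cluster/module_node | grep -c '^metrics'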

---------

Co-authored-by: Davide Principi <[email protected]>
gsanchietti and DavidePrincipi authored Feb 27, 2025
1 parent 06c1fe7 commit f535979
Showing 18 changed files with 253 additions and 236 deletions.
3 changes: 2 additions & 1 deletion core/build-image.sh
@@ -78,11 +78,12 @@ printf "RSYNC_IMAGE=${repobase}/rsync:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "RESTIC_IMAGE=${repobase}/restic:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "SUPPORT_IMAGE=${repobase}/support:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "PROMTAIL_IMAGE=docker.io/grafana/promtail:2.9.2\n" >> "${core_env_file}"
+printf "NODE_EXPORTER_IMAGE=quay.io/prometheus/node-exporter:v1.9.0\n" >> "${core_env_file}"
 chmod -c 644 "${core_env_file}"
 source "${core_env_file}"
 buildah add "${container}" ${core_env_file} /etc/nethserver/core.env
 buildah config \
-    --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE}" \
+    --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE} ${NODE_EXPORTER_IMAGE}" \
     --label="org.nethserver.flags=core_module" \
     --entrypoint=/ "${container}"
 buildah commit "${container}" "${repobase}/${reponame}"
15 changes: 0 additions & 15 deletions core/imageroot/etc/systemd/system/node-monitor.service

This file was deleted.

31 changes: 31 additions & 0 deletions core/imageroot/etc/systemd/system/node_exporter.service
@@ -0,0 +1,31 @@
[Unit]
Description=Prometheus node_exporter

[Service]
Type=forking
Environment=PODMAN_SYSTEMD_UNIT=%n
EnvironmentFile=-/etc/nethserver/core.env
WorkingDirectory=/var/lib/nethserver/node/state
Restart=always
ExecStartPre=/bin/rm -f %t/%N.pid %t/%N.cid
ExecStartPre=/usr/bin/mkdir -p /run/node_exporter
ExecStart=/usr/bin/podman run \
    --conmon-pidfile %t/%N.pid \
    --cidfile %t/%N.cid \
    --cgroups=no-conmon \
    --replace \
    --name %N \
    --network=host \
    --pid=host \
    -d \
    -v /:/host:ro,rslave \
    ${NODE_EXPORTER_IMAGE} \
    --collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/) \
    --collector.textfile.directory=/host/run/node_exporter \
    --path.rootfs=/host
ExecStop=/usr/bin/podman stop --ignore --cidfile %t/%N.cid -t 10
ExecStopPost=/usr/bin/podman rm --ignore -f --cidfile %t/%N.cid
PIDFile=%t/%N.pid

[Install]
WantedBy=default.target
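
A quick smoke test for the unit above (not part of the commit; assumes the service has been enabled on the node):

    systemctl status node_exporter.service --no-pager

    # The exporter runs with --network=host, so it answers on the node itself
    curl -s http://127.0.0.1:9100/metrics | grep -m1 node_exporter_build_info

    # Files matching *.prom in this directory are read by the textfile collector
    ls /run/node_exporter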
69 changes: 69 additions & 0 deletions core/imageroot/usr/local/sbin/switch-leader
@@ -23,10 +23,12 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--endpoint', required=False, help="Override VPN endpoint setting in Redis DB")
 parser.add_argument('--node', required=True, type=int, help="Node ID of the new leader")
 parser.add_argument('--without-loki', required=False, help="Do not install loki on the new leader node", action='store_false')
+parser.add_argument('--without-metrics', required=False, help="Do not install prometheus and grafana on the new leader node", action='store_false')
 args = parser.parse_args()
 node_id = args.node
 node_endpoint = args.endpoint
 install_loki = args.without_loki
+install_metrics = args.without_metrics

 # This command runs under the cluster environment. It always points to the local
 # Redis instance and has admin privileges over it.
@@ -150,5 +152,72 @@ if install_loki and node_id == self_id:
     }))
     redis_pipeline.execute()

+# Install metrics on the leader node, remove existing metrics instances
+if install_metrics and node_id == self_id:
+    settings = None
+    custom_alerts = None
+    custom_templates = None
+    # Remove existing metrics instances
+    remove_tasks = []
+    old_metrics_instance = rdb.get('cluster/default_instance/metrics')
+    if old_metrics_instance:
+        old_node = rdb.hget('cluster/module_node', old_metrics_instance)
+        settings = rdb.hgetall(f'module/{old_metrics_instance}/settings')
+        custom_alerts = rdb.hgetall(f'module/{old_metrics_instance}/custom_alerts')
+        custom_templates = rdb.hgetall(f'module/{old_metrics_instance}/custom_templates')
+        subtasks = agent.tasks.run('cluster', 'remove-module', data={
+            "module_id": old_metrics_instance,
+            "preserve_data": False,
+            "force": True,
+        }, endpoint="redis://cluster-leader")
+        # Make sure the node is clean
+        agent.tasks.run_nowait(f'node/{old_node}', 'remove-module', data={
+            "module_id": old_metrics_instance,
+            "preserve_data": False,
+            "check_idle_time": 0,
+        }, endpoint="redis://cluster-leader")
+        # Cleanup traefik routes
+        traefik_id = rdb.get(f'node/{old_node}/default_instance/traefik')
+        agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={
+            'instance': old_metrics_instance + '_prometheus'
+        }, endpoint="redis://cluster-leader")
+        agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={
+            'instance': old_metrics_instance + '_grafana'
+        }, endpoint="redis://cluster-leader")
+
+    # Install prometheus and grafana on the leader node
+    module = "metrics"
+    result = agent.tasks.run("cluster", "add-module", data={
+        "image": module,
+        "node": node_id,
+        "check_idle_time": 0,
+    }, extra={
+        "isNotificationHidden": True,
+    }, endpoint="redis://cluster-leader")
+    if result['exit_code'] != 0:
+        print(f"[ERROR] Failed to install {module} on the new leader node: {result['error']}", file=sys.stderr)
+        errors += 1
+    else:
+        mid = result['output']['module_id'] # New module ID
+        result_config = agent.tasks.run(f"module/{mid}", "restore-configuration", data={
+            "settings": settings,
+            "custom_alerts": custom_alerts,
+            "custom_templates": custom_templates,
+        }, extra={
+            "isNotificationHidden": True,
+        }, endpoint="redis://cluster-leader")
+        if result_config['exit_code'] != 0:
+            print(f"[ERROR] Failed to restore {module} configuration on the new leader node: {result_config['error']}", file=sys.stderr)
+            errors += 1
+

 if errors > 0:
     sys.exit(1)
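
Hypothetical invocations of switch-leader (flag names taken from the argparse definitions above; because of action='store_false', passing a --without-* flag disables the corresponding installation):

    # Promote node 2; loki and metrics are installed on the new leader
    switch-leader --node 2

    # Promote node 2, but skip the prometheus/grafana installation
    switch-leader --node 2 --without-metrics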
@@ -85,6 +85,10 @@ add1_module_failures = agent.tasks.runp_brief([
         'image': 'loki',
         'node': NODE_ID,
     }},
+    {"agent_id": "cluster", "action": "add-module", "data": {
+        'image': 'metrics',
+        'node': NODE_ID,
+    }},
     ],
     endpoint = "redis://cluster-leader",
     progress_callback = agent.get_progress_callback(25, 85),
@@ -0,0 +1,11 @@
#!/bin/bash

#
# Copyright (C) 2025 Nethesis S.r.l.
# SPDX-License-Identifier: GPL-3.0-or-later
#

set -e
exec 1>&2

systemctl enable --now node_exporter.service
@@ -9,11 +9,12 @@ import agent
 import json
 import sys
 import os
+import tempfile

-FAILED = b'F'
-SUCCESS = b'0'
-UNKNOWN = b'U'
+FAILED = 0
+SUCCESS = 1
+UNKNOWN = -1
+OUTPUT_FILE = "/run/node_exporter/backup.prom"

 rdb = agent.redis_connect()
 leader_id = int(rdb.hget('cluster/environment', 'NODE_ID'))
@@ -22,29 +23,38 @@ self_id = int(os.environ['NODE_ID'])
 if self_id != leader_id:
     sys.exit(0) # LEADER ONLY! Do not run this procedure in worker nodes.

+# Ensure the output directory exists
+os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
+
 modules = set(rdb.hkeys("cluster/module_node"))

-def get_module_backup_status(module_id):
-    backup_status = UNKNOWN
+backups = {}
+for module_id in modules:
     for backup_id in rdb.smembers(f"module/{module_id}/backups"):
+        if not backup_id in backups:
+            name = rdb.hget(f"cluster/backup/{backup_id}", "name")
+            backups[backup_id] = {"name": name, "status": UNKNOWN}
         nerrors = rdb.hget(f"module/{module_id}/backup_status/{backup_id}", "errors") or ""
         try:
             if int(nerrors) > 0:
-                return FAILED
+                backups[backup_id]["status"] = FAILED
         except ValueError:
             pass
         if nerrors == "0":
-            backup_status = SUCCESS
-    return backup_status
+            backups[backup_id]["status"] = SUCCESS

+# Create the content to be written in node_exporter format
+content = f"""# HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown)
+# TYPE node_backup_status gauge
+"""
+for backup_id in backups:
+    backup = backups[backup_id]
+    content += 'node_backup_status{id="%s",name="%s"} %i\n' % (backup_id, backup.get('name', '_'), backup.get('status'))

-cluster_backup_status_list = [get_module_backup_status(module_id) for module_id in modules]
+# Write the content to the output file atomically
+with tempfile.NamedTemporaryFile('w', delete=False, dir=os.path.dirname(OUTPUT_FILE)) as temp_file:
+    temp_file.write(content)
+    temp_filename = temp_file.name

-if FAILED in cluster_backup_status_list:
-    cluster_backup_status = FAILED
-elif SUCCESS in cluster_backup_status_list:
-    cluster_backup_status = SUCCESS
-else:
-    cluster_backup_status = UNKNOWN
+os.replace(temp_filename, OUTPUT_FILE)

-with open('/run/backup-monitor.dat', 'wb') as fdat:
-    fdat.write(cluster_backup_status)
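
With the new code path, /run/node_exporter/backup.prom holds Prometheus text-format samples matching the HELP/TYPE header built above; the IDs and names below are illustrative, not taken from the commit:

    # HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown)
    # TYPE node_backup_status gauge
    node_backup_status{id="1",name="nightly"} 1
    node_backup_status{id="2",name="offsite"} -1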
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

#
# Copyright (C) 2025 Nethesis S.r.l.
# SPDX-License-Identifier: GPL-3.0-or-later
#

import agent
import agent.tasks
import os
import re

rdb = agent.redis_connect(privileged=True)
is_metrics_installed = rdb.exists(f"node/{os.getenv('NODE_ID')}/default_instance/metrics")

if not is_metrics_installed:
    # Remove node_exporter modules: they will conflict with the new metrics module
    for module in rdb.hgetall("cluster/module_node").items():
        # if the module name matches node_exporter<number>, remove it
        if re.match(r"node_exporter\d+", module[0]):
            agent.tasks.run('cluster', 'remove-module', data={
                "module_id": module[0],
                "preserve_data": False,
                "force": True,
            }, endpoint="redis://cluster-leader")

    # Install metrics module
    add_module_failures = agent.tasks.runp_brief([
        {"agent_id": "cluster", "action": "add-module", "data": {
            'image': 'metrics',
            'node': int(os.getenv("NODE_ID")),
        }}],
        endpoint = "redis://cluster-leader",
    )
    agent.assert_exp(add_module_failures == 0)
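
A hypothetical pre-update check of which legacy instances the regex above would remove (module IDs are assumed to look like node_exporter1, and redis-cli is assumed to be authorized against the cluster Redis DB):

    redis-cli HKEYS cluster/module_node | grep -E '^node_exporter[0-9]+'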
3 changes: 0 additions & 3 deletions core/imageroot/var/lib/nethserver/node/bin/check-subscription
@@ -43,7 +43,6 @@ function enable_nsent()
         # Some services must be disabled in worker nodes
         systemctl disable --now send-heartbeat.service send-inventory.timer send-backup.timer
     fi
-    systemctl enable --now node-monitor.service
     systemctl enable check-subscription.service
 }

@@ -60,7 +59,6 @@ function enable_nscom()
         # Some services must be disabled in worker nodes
         systemctl disable --now send-heartbeat.service send-inventory.timer
     fi
-    systemctl enable --now node-monitor.service
     systemctl enable check-subscription.service
 }

@@ -70,7 +68,6 @@ elif [[ "${provider}" == "nscom" ]]; then
     enable_nscom
 else
     systemctl disable --now \
-        node-monitor.service \
         send-heartbeat.service \
         send-inventory.timer \
         send-backup.timer