feat(monitoring): integrate metrics inside the core (#816)
Changes:
- removed the node-monitor service
- node_exporter is now a core module, installed on all nodes; virtual filesystems are not monitored, to avoid false alerts on partition free space
- on update, install the new metrics module and remove existing node_exporter instances to avoid conflicts on port 9100
- the metrics module is installed and running only on the leader node
- switch-leader makes sure there is only one instance of the metrics module inside the cluster (see the spot-check sketched below)
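
For reviewers, a quick spot-check of the intended end state on a running cluster — a sketch, not part of the commit; it assumes shell access on the leader node, that redis-cli can reach and is authorized against the cluster Redis DB, and that metrics module IDs look like metrics1:

    # Every node: node_exporter answers on the host network, port 9100
    curl -s http://127.0.0.1:9100/metrics | head -n 5

    # Whole cluster: exactly one metrics module should be registered
    redis-cli HGETALL cluster/module_node | grep -c '^metrics'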

---------

Co-authored-by: Davide Principi <[email protected]>
gsanchietti and DavidePrincipi authored Feb 27, 2025
1 parent 06c1fe7 commit f535979
Showing 18 changed files with 253 additions and 236 deletions.
3 changes: 2 additions & 1 deletion core/build-image.sh
@@ -78,11 +78,12 @@ printf "RSYNC_IMAGE=${repobase}/rsync:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "RESTIC_IMAGE=${repobase}/restic:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "SUPPORT_IMAGE=${repobase}/support:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}"
 printf "PROMTAIL_IMAGE=docker.io/grafana/promtail:2.9.2\n" >> "${core_env_file}"
+printf "NODE_EXPORTER_IMAGE=quay.io/prometheus/node-exporter:v1.9.0\n" >> "${core_env_file}"
 chmod -c 644 "${core_env_file}"
 source "${core_env_file}"
 buildah add "${container}" ${core_env_file} /etc/nethserver/core.env
 buildah config \
-    --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE}" \
+    --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE} ${NODE_EXPORTER_IMAGE}" \
     --label="org.nethserver.flags=core_module" \
     --entrypoint=/ "${container}"
 buildah commit "${container}" "${repobase}/${reponame}"
15 changes: 0 additions & 15 deletions core/imageroot/etc/systemd/system/node-monitor.service

This file was deleted.

31 changes: 31 additions & 0 deletions core/imageroot/etc/systemd/system/node_exporter.service
@@ -0,0 +1,31 @@
[Unit]
Description=Prometheus node_exporter

[Service]
Type=forking
Environment=PODMAN_SYSTEMD_UNIT=%n
EnvironmentFile=-/etc/nethserver/core.env
WorkingDirectory=/var/lib/nethserver/node/state
Restart=always
ExecStartPre=/bin/rm -f %t/%N.pid %t/%N.cid
ExecStartPre=/usr/bin/mkdir -p /run/node_exporter
ExecStart=/usr/bin/podman run \
    --conmon-pidfile %t/%N.pid \
    --cidfile %t/%N.cid \
    --cgroups=no-conmon \
    --replace \
    --name %N \
    --network=host \
    --pid=host \
    -d \
    -v /:/host:ro,rslave \
    ${NODE_EXPORTER_IMAGE} \
    --collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/) \
    --collector.textfile.directory=/host/run/node_exporter \
    --path.rootfs=/host
ExecStop=/usr/bin/podman stop --ignore --cidfile %t/%N.cid -t 10
ExecStopPost=/usr/bin/podman rm --ignore -f --cidfile %t/%N.cid
PIDFile=%t/%N.pid

[Install]
WantedBy=default.target
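
A quick smoke test for the unit above (not part of the commit; assumes the service has been enabled on the node):

    systemctl status node_exporter.service --no-pager

    # The exporter runs with --network=host, so it answers on the node itself
    curl -s http://127.0.0.1:9100/metrics | grep -m1 node_exporter_build_info

    # Files matching *.prom in this directory are read by the textfile collector
    ls /run/node_exporter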
69 changes: 69 additions & 0 deletions core/imageroot/usr/local/sbin/switch-leader
@@ -23,10 +23,12 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--endpoint', required=False, help="Override VPN endpoint setting in Redis DB")
 parser.add_argument('--node', required=True, type=int, help="Node ID of the new leader")
 parser.add_argument('--without-loki', required=False, help="Do not install loki on the new leader node", action='store_false')
+parser.add_argument('--without-metrics', required=False, help="Do not install prometheus and grafana on the new leader node", action='store_false')
 args = parser.parse_args()
 node_id = args.node
 node_endpoint = args.endpoint
 install_loki = args.without_loki
+install_metrics = args.without_metrics

 # This command runs under the cluster environment. It always points to the local
 # Redis instance and has admin privileges over it.
@@ -150,5 +152,72 @@ if install_loki and node_id == self_id:
     }))
     redis_pipeline.execute()

+# Install metrics on the leader node, remove existing metrics instances
+if install_metrics and node_id == self_id:
+    settings = None
+    custom_alerts = None
+    custom_templates = None
+    # Remove existing metrics instances
+    remove_tasks = []
+    old_metrics_instance = rdb.get('cluster/default_instance/metrics')
+    if old_metrics_instance:
+        old_node = rdb.hget('cluster/module_node', old_metrics_instance)
+        settings = rdb.hgetall(f'module/{old_metrics_instance}/settings')
+        custom_alerts = rdb.hgetall(f'module/{old_metrics_instance}/custom_alerts')
+        custom_templates = rdb.hgetall(f'module/{old_metrics_instance}/custom_templates')
+        subtasks = agent.tasks.run('cluster', 'remove-module', data={
+            "module_id": old_metrics_instance,
+            "preserve_data": False,
+            "force": True,
+        }, endpoint="redis://cluster-leader")
+        # Make sure the node is clean
+        agent.tasks.run_nowait(f'node/{old_node}', 'remove-module', data={
+            "module_id": old_metrics_instance,
+            "preserve_data": False,
+            "check_idle_time": 0,
+        }, endpoint="redis://cluster-leader")
+        # Cleanup traefik routes
+        traefik_id = rdb.get(f'node/{old_node}/default_instance/traefik')
+        agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={
+            'instance': old_metrics_instance + '_prometheus'
+        }, endpoint="redis://cluster-leader")
+        agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={
+            'instance': old_metrics_instance + '_grafana'
+        }, endpoint="redis://cluster-leader")
+
+    # Install prometheus and grafana on the leader node
+    module = "metrics"
+    result = agent.tasks.run("cluster", "add-module", data={
+        "image": module,
+        "node": node_id,
+        "check_idle_time": 0,
+    }, extra={
+        "isNotificationHidden": True,
+    }, endpoint="redis://cluster-leader")
+    if result['exit_code'] != 0:
+        print(f"[ERROR] Failed to install {module} on the new leader node: {result['error']}", file=sys.stderr)
+        errors += 1
+    else:
+        mid = result['output']['module_id'] # New module ID
+        result_config = agent.tasks.run(f"module/{mid}", "restore-configuration", data={
+            "settings": settings,
+            "custom_alerts": custom_alerts,
+            "custom_templates": custom_templates,
+        }, extra={
+            "isNotificationHidden": True,
+        }, endpoint="redis://cluster-leader")
+        if result_config['exit_code'] != 0:
+            print(f"[ERROR] Failed to restore {module} configuration on the new leader node: {result_config['error']}", file=sys.stderr)
+            errors += 1
+

 if errors > 0:
     sys.exit(1)
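
Hypothetical invocations of switch-leader (flag names taken from the argparse definitions above; because of action='store_false', passing a --without-* flag disables the corresponding installation):

    # Promote node 2; loki and metrics are installed on the new leader
    switch-leader --node 2

    # Promote node 2, but skip the prometheus/grafana installation
    switch-leader --node 2 --without-metrics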
@@ -85,6 +85,10 @@ add1_module_failures = agent.tasks.runp_brief([
         'image': 'loki',
         'node': NODE_ID,
     }},
+    {"agent_id": "cluster", "action": "add-module", "data": {
+        'image': 'metrics',
+        'node': NODE_ID,
+    }},
     ],
     endpoint = "redis://cluster-leader",
     progress_callback = agent.get_progress_callback(25, 85),
@@ -0,0 +1,11 @@
#!/bin/bash

#
# Copyright (C) 2025 Nethesis S.r.l.
# SPDX-License-Identifier: GPL-3.0-or-later
#

set -e
exec 1>&2

systemctl enable --now node_exporter.service
@@ -9,11 +9,12 @@ import agent
 import json
 import sys
 import os
+import tempfile

-FAILED = b'F'
-SUCCESS = b'0'
-UNKNOWN = b'U'
+FAILED = 0
+SUCCESS = 1
+UNKNOWN = -1
+OUTPUT_FILE = "/run/node_exporter/backup.prom"

 rdb = agent.redis_connect()
 leader_id = int(rdb.hget('cluster/environment', 'NODE_ID'))
@@ -22,29 +23,38 @@ self_id = int(os.environ['NODE_ID'])
 if self_id != leader_id:
     sys.exit(0) # LEADER ONLY! Do not run this procedure in worker nodes.

+# Ensure the output directory exists
+os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
+
 modules = set(rdb.hkeys("cluster/module_node"))

-def get_module_backup_status(module_id):
-    backup_status = UNKNOWN
+backups = {}
+for module_id in modules:
     for backup_id in rdb.smembers(f"module/{module_id}/backups"):
+        if not backup_id in backups:
+            name = rdb.hget(f"cluster/backup/{backup_id}", "name")
+            backups[backup_id] = {"name": name, "status": UNKNOWN}
         nerrors = rdb.hget(f"module/{module_id}/backup_status/{backup_id}", "errors") or ""
         try:
             if int(nerrors) > 0:
-                return FAILED
+                backups[backup_id]["status"] = FAILED
         except ValueError:
             pass
         if nerrors == "0":
-            backup_status = SUCCESS
-    return backup_status
+            backups[backup_id]["status"] = SUCCESS

+# Create the content to be written in node_exporter format
+content = f"""# HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown)
+# TYPE node_backup_status gauge
+"""
+for backup_id in backups:
+    backup = backups[backup_id]
+    content += 'node_backup_status{id="%s",name="%s"} %i\n' % (backup_id, backup.get('name', '_'), backup.get('status'))

-cluster_backup_status_list = [get_module_backup_status(module_id) for module_id in modules]
+# Write the content to the output file atomically
+with tempfile.NamedTemporaryFile('w', delete=False, dir=os.path.dirname(OUTPUT_FILE)) as temp_file:
+    temp_file.write(content)
+    temp_filename = temp_file.name

-if FAILED in cluster_backup_status_list:
-    cluster_backup_status = FAILED
-elif SUCCESS in cluster_backup_status_list:
-    cluster_backup_status = SUCCESS
-else:
-    cluster_backup_status = UNKNOWN
+os.replace(temp_filename, OUTPUT_FILE)

-with open('/run/backup-monitor.dat', 'wb') as fdat:
-    fdat.write(cluster_backup_status)
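
With the new code path, /run/node_exporter/backup.prom holds Prometheus text-format samples matching the HELP/TYPE header built above; the IDs and names below are illustrative, not taken from the commit:

    # HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown)
    # TYPE node_backup_status gauge
    node_backup_status{id="1",name="nightly"} 1
    node_backup_status{id="2",name="offsite"} -1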
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

#
# Copyright (C) 2025 Nethesis S.r.l.
# SPDX-License-Identifier: GPL-3.0-or-later
#

import agent
import agent.tasks
import os
import re

rdb = agent.redis_connect(privileged=True)
is_metrics_installed = rdb.exists(f"node/{os.getenv('NODE_ID')}/default_instance/metrics")

if not is_metrics_installed:
    # Remove node_exporter modules: they will conflict with the new metrics module
    for module in rdb.hgetall("cluster/module_node").items():
        # if the module name matches node_exporter<number>, remove it
        if re.match(r"node_exporter\d+", module[0]):
            agent.tasks.run('cluster', 'remove-module', data={
                "module_id": module[0],
                "preserve_data": False,
                "force": True,
            }, endpoint="redis://cluster-leader")

    # Install metrics module
    add_module_failures = agent.tasks.runp_brief([
        {"agent_id": "cluster", "action": "add-module", "data": {
            'image': 'metrics',
            'node': int(os.getenv("NODE_ID")),
        }}],
        endpoint = "redis://cluster-leader",
    )
    agent.assert_exp(add_module_failures == 0)
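
A hypothetical pre-update check of which legacy instances the regex above would remove (module IDs are assumed to look like node_exporter1, and redis-cli is assumed to be authorized against the cluster Redis DB):

    redis-cli HKEYS cluster/module_node | grep -E '^node_exporter[0-9]+'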
3 changes: 0 additions & 3 deletions core/imageroot/var/lib/nethserver/node/bin/check-subscription
@@ -43,7 +43,6 @@ function enable_nsent()
         # Some services must be disabled in worker nodes
         systemctl disable --now send-heartbeat.service send-inventory.timer send-backup.timer
     fi
-    systemctl enable --now node-monitor.service
     systemctl enable check-subscription.service
 }

@@ -60,7 +59,6 @@ function enable_nscom()
         # Some services must be disabled in worker nodes
         systemctl disable --now send-heartbeat.service send-inventory.timer
     fi
-    systemctl enable --now node-monitor.service
     systemctl enable check-subscription.service
 }

@@ -70,7 +68,6 @@ elif [[ "${provider}" == "nscom" ]]; then
     enable_nscom
 else
     systemctl disable --now \
-        node-monitor.service \
         send-heartbeat.service \
         send-inventory.timer \
         send-backup.timer