From f53597977bd16b71ee4723e57210925a77f4aa15 Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Thu, 27 Feb 2025 11:27:24 +0100 Subject: [PATCH] feat(monitoring): integrate metrics inside the core (#816) Changes: - removed the node-monitor service - node_exporter is now a core module and installed on all nodes: virtual filesystems are not monitored to avoid false alerts on partition free space - on update, install the new metrics module and remove existing node_exporter instances to avoid conflicts on port 9100 - metrics module is installed and running only on the leader node - switch-leader makes sure there is only one instance of metrics module inside the cluster --------- Co-authored-by: Davide Principi --- core/build-image.sh | 3 +- .../etc/systemd/system/node-monitor.service | 15 -- .../etc/systemd/system/node_exporter.service | 31 ++++ core/imageroot/usr/local/sbin/switch-leader | 69 +++++++ .../cluster/actions/create-cluster/50update | 4 + .../create-cluster/70start_node_exporter | 11 ++ .../actions/join-node/70start_node_exporter | 1 + .../backup-status-changed/10node_monitor | 46 +++-- .../update-core-post-modules.d/70metrics | 35 ++++ .../nethserver/node/bin/check-subscription | 3 - .../var/lib/nethserver/node/bin/node-monitor | 169 ------------------ .../node/bin/write-node-monitor-envfile | 27 --- .../var/lib/nethserver/node/uninstall.sh | 2 +- .../node/update-core.d/30restart_subscription | 2 +- .../node/update-core.d/50start_node_exporter | 15 ++ .../node/update-core.d/80node_monitor | 16 ++ docs/core/metrics.md | 39 ++++ docs/core/subscription.md | 1 - 18 files changed, 253 insertions(+), 236 deletions(-) delete mode 100644 core/imageroot/etc/systemd/system/node-monitor.service create mode 100644 core/imageroot/etc/systemd/system/node_exporter.service create mode 100755 core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_node_exporter create mode 120000 
core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_node_exporter create mode 100755 core/imageroot/var/lib/nethserver/cluster/update-core-post-modules.d/70metrics delete mode 100755 core/imageroot/var/lib/nethserver/node/bin/node-monitor delete mode 100755 core/imageroot/var/lib/nethserver/node/bin/write-node-monitor-envfile create mode 100755 core/imageroot/var/lib/nethserver/node/update-core.d/50start_node_exporter create mode 100755 core/imageroot/var/lib/nethserver/node/update-core.d/80node_monitor create mode 100644 docs/core/metrics.md diff --git a/core/build-image.sh b/core/build-image.sh index ccb8781a1..06c302d8a 100755 --- a/core/build-image.sh +++ b/core/build-image.sh @@ -78,11 +78,12 @@ printf "RSYNC_IMAGE=${repobase}/rsync:%s\n" "${IMAGETAG:-latest}" >> "${core_env printf "RESTIC_IMAGE=${repobase}/restic:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "SUPPORT_IMAGE=${repobase}/support:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "PROMTAIL_IMAGE=docker.io/grafana/promtail:2.9.2\n" >> "${core_env_file}" +printf "NODE_EXPORTER_IMAGE=quay.io/prometheus/node-exporter:v1.9.0\n" >> "${core_env_file}" chmod -c 644 "${core_env_file}" source "${core_env_file}" buildah add "${container}" ${core_env_file} /etc/nethserver/core.env buildah config \ - --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE}" \ + --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE} ${NODE_EXPORTER_IMAGE}" \ --label="org.nethserver.flags=core_module" \ --entrypoint=/ "${container}" buildah commit "${container}" "${repobase}/${reponame}" diff --git a/core/imageroot/etc/systemd/system/node-monitor.service b/core/imageroot/etc/systemd/system/node-monitor.service deleted file mode 100644 index fd66e1dbd..000000000 --- a/core/imageroot/etc/systemd/system/node-monitor.service +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Node 
monitoring -Wants=redis.service -After=redis.service - -[Service] -Type=simple -OOMScoreAdjust=-1 -ExecStartPre=-runagent -m node write-node-monitor-envfile -ExecStart=runagent -m node node-monitor -Restart=always -SyslogIdentifier=%N - -[Install] -WantedBy=default.target diff --git a/core/imageroot/etc/systemd/system/node_exporter.service b/core/imageroot/etc/systemd/system/node_exporter.service new file mode 100644 index 000000000..86440b878 --- /dev/null +++ b/core/imageroot/etc/systemd/system/node_exporter.service @@ -0,0 +1,31 @@ +[Unit] +Description=Prometheus node_exporter + +[Service] +Type=forking +Environment=PODMAN_SYSTEMD_UNIT=%n +EnvironmentFile=-/etc/nethserver/core.env +WorkingDirectory=/var/lib/nethserver/node/state +Restart=always +ExecStartPre=/bin/rm -f %t/%N.pid %t/%N.cid +ExecStartPre=/usr/bin/mkdir -p /run/node_exporter +ExecStart=/usr/bin/podman run \ + --conmon-pidfile %t/%N.pid \ + --cidfile %t/%N.cid \ + --cgroups=no-conmon \ + --replace \ + --name %N \ + --network=host \ + --pid=host \ + -d \ + -v /:/host:ro,rslave \ + ${NODE_EXPORTER_IMAGE} \ + --collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/) \ + --collector.textfile.directory=/host/run/node_exporter \ + --path.rootfs=/host +ExecStop=/usr/bin/podman stop --ignore --cidfile %t/%N.cid -t 10 +ExecStopPost=/usr/bin/podman rm --ignore -f --cidfile %t/%N.cid +PIDFile=%t/%N.pid + +[Install] +WantedBy=default.target diff --git a/core/imageroot/usr/local/sbin/switch-leader b/core/imageroot/usr/local/sbin/switch-leader index 6c9e798b6..63825aa31 100755 --- a/core/imageroot/usr/local/sbin/switch-leader +++ b/core/imageroot/usr/local/sbin/switch-leader @@ -23,10 +23,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--endpoint', required=False, help="Override VPN endpoint setting in Redis DB") parser.add_argument('--node', required=True, type=int, help="Node ID of the new leader") parser.add_argument('--without-loki', required=False, help="Do not install loki on the 
new leader node", action='store_false') +parser.add_argument('--without-metrics', required=False, help="Do not install prometheus and grafana on the new leader node", action='store_false') args = parser.parse_args() node_id = args.node node_endpoint = args.endpoint install_loki = args.without_loki +install_metrics = args.without_metrics # This command runs under the cluster environment. It always point to the local # Redis instance and has admin privileges over it. @@ -150,5 +152,72 @@ if install_loki and node_id == self_id: })) redis_pipeline.execute() +# Install metrics on the leader node, remove existing metrics instances +if install_metrics and node_id == self_id: + settings = None + custom_alerts = None + custom_templates = None + # Remove existing metrics instances + remove_tasks = [] + old_metrics_instance = rdb.get('cluster/default_instance/metrics') + if old_metrics_instance: + old_node = rdb.hget('cluster/module_node', old_metrics_instance) + settings = rdb.hgetall(f'module/{old_metrics_instance}/settings') + custom_alerts = rdb.hgetall(f'module/{old_metrics_instance}/custom_alerts') + custom_templates = rdb.hgetall(f'module/{old_metrics_instance}/custom_templates') + subtasks = agent.tasks.run('cluster', 'remove-module', data={ + "module_id": old_metrics_instance, + "preserve_data": False, + "force": True, + }, endpoint="redis://cluster-leader") + # Make sure the node is clean + agent.tasks.run_nowait(f'node/{old_node}', 'remove-module', data={ + "module_id": old_metrics_instance, + "preserve_data": False, + "check_idle_time": 0, + }, endpoint="redis://cluster-leader") + # Cleanup traefik routes + traefik_id = rdb.get(f'node/{old_node}/default_instance/traefik') + agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={ + 'instance': old_metrics_instance + '_prometheus' + }, + 
endpoint="redis://cluster-leader" + ) + agent.tasks.run_nowait(f'module/{traefik_id}', 'delete-route', data={ + 'instance': old_metrics_instance + '_grafana' + }, + endpoint="redis://cluster-leader" + ) + + # Install prometheus and grafana on the leader node + module = "metrics" + result = agent.tasks.run("cluster", "add-module", data={ + "image": module, + "node": node_id, + "check_idle_time": 0, + }, + extra={ + "isNotificationHidden": True, + }, + endpoint="redis://cluster-leader") + if result['exit_code'] != 0: + print(f"[ERROR] Failed to install {module} on the new leader node: {result['error']}", file=sys.stderr) + errors += 1 + else: + mid = result['output']['module_id'] # New module ID + result_config = agent.tasks.run(f"module/{mid}", "restore-configuration", data={ + "settings": settings, + "custom_alerts": custom_alerts, + "custom_templates": custom_templates, + }, + extra={ + "isNotificationHidden": True, + }, + endpoint="redis://cluster-leader") + if result_config['exit_code'] != 0: + print(f"[ERROR] Failed to restore the configuration of {module} on the new leader node: {result_config['error']}", file=sys.stderr) + errors += 1 + + if errors > 0: sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/50update b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/50update index b58643134..08368de4e 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/50update @@ -85,6 +85,10 @@ add1_module_failures = agent.tasks.runp_brief([ 'image': 'loki', 'node': NODE_ID, }}, + {"agent_id": "cluster", "action": "add-module", "data": { + 'image': 'metrics', + 'node': NODE_ID, + }}, ], endpoint = "redis://cluster-leader", progress_callback = agent.get_progress_callback(25, 85), diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_node_exporter 
b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_node_exporter new file mode 100755 index 000000000..ba3ed5255 --- /dev/null +++ 
b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_node_exporter @@ -0,0 +1,11 @@ +#!/bin/bash + +# +# Copyright (C) 2025 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +set -e +exec 1>&2 + +systemctl enable --now node_exporter.service diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_node_exporter b/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_node_exporter new file mode 120000 index 000000000..ec8c9b795 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_node_exporter @@ -0,0 +1 @@ +../create-cluster/70start_node_exporter \ No newline at end of file diff --git a/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor b/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor index de9d0eec7..df17f5cd4 100755 --- a/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor +++ b/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor @@ -9,11 +9,12 @@ import agent import json import sys import os +import tempfile - -FAILED = b'F' -SUCCESS = b'0' -UNKNOWN = b'U' +FAILED = 0 +SUCCESS = 1 +UNKNOWN = -1 +OUTPUT_FILE = "/run/node_exporter/backup.prom" rdb = agent.redis_connect() leader_id = int(rdb.hget('cluster/environment', 'NODE_ID')) @@ -22,29 +23,38 @@ self_id = int(os.environ['NODE_ID']) if self_id != leader_id: sys.exit(0) # LEADER ONLY! Do not run this procedure in worker nodes. 
+# Ensure the output directory exists +os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) + modules = set(rdb.hkeys("cluster/module_node")) -def get_module_backup_status(module_id): - backup_status = UNKNOWN +backups = {} +for module_id in modules: for backup_id in rdb.smembers(f"module/{module_id}/backups"): + if backup_id not in backups: + name = rdb.hget(f"cluster/backup/{backup_id}", "name") + backups[backup_id] = {"name": name, "status": UNKNOWN} nerrors = rdb.hget(f"module/{module_id}/backup_status/{backup_id}", "errors") or "" try: if int(nerrors) > 0: - return FAILED + backups[backup_id]["status"] = FAILED except ValueError: pass - if nerrors == "0": - backup_status = SUCCESS - return backup_status + if nerrors == "0" and backups[backup_id]["status"] != FAILED: + backups[backup_id]["status"] = SUCCESS + +# Create the content to be written in node_exporter format +content = """# HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown) +# TYPE node_backup_status gauge +""" +for backup_id in backups: + backup = backups[backup_id] + content += 'node_backup_status{id="%s",name="%s"} %i\n' % (backup_id, (backup.get('name') or '_').replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n'), backup.get('status')) -cluster_backup_status_list = [get_module_backup_status(module_id) for module_id in modules] +# Write the content to the output file atomically +with tempfile.NamedTemporaryFile('w', delete=False, dir=os.path.dirname(OUTPUT_FILE)) as temp_file: + temp_file.write(content) + temp_filename = temp_file.name -if FAILED in cluster_backup_status_list: - cluster_backup_status = FAILED -elif SUCCESS in cluster_backup_status_list: - cluster_backup_status = SUCCESS -else: - cluster_backup_status = UNKNOWN +os.replace(temp_filename, OUTPUT_FILE) -with open('/run/backup-monitor.dat', 'wb') as fdat: - fdat.write(cluster_backup_status) diff --git 
a/core/imageroot/var/lib/nethserver/cluster/update-core-post-modules.d/70metrics b/core/imageroot/var/lib/nethserver/cluster/update-core-post-modules.d/70metrics new file mode 100755 index 000000000..590ffcf70 --- 
/dev/null +++ b/core/imageroot/var/lib/nethserver/cluster/update-core-post-modules.d/70metrics @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2025 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +import agent +import agent.tasks +import os +import re + +rdb = agent.redis_connect(privileged=True) +is_metrics_installed = rdb.exists(f"node/{os.getenv('NODE_ID')}/default_instance/metrics") + +if not is_metrics_installed: + # Remove node_exporter modules: they will conflict with the new metrics module + for module in rdb.hgetall("cluster/module_node").items(): + # if module name matches node_exporter, remove it + if re.match(r"node_exporter\d+", module[0]): + agent.tasks.run('cluster', 'remove-module', data={ + "module_id": module[0], + "preserve_data": False, + "force": True, + }, endpoint="redis://cluster-leader") + + # Install metrics module + add_module_failures = agent.tasks.runp_brief([ + {"agent_id": "cluster", "action": "add-module", "data": { + 'image': 'metrics', + 'node': int(os.getenv("NODE_ID")), + }}], + endpoint = "redis://cluster-leader", + ) + agent.assert_exp(add_module_failures == 0) diff --git a/core/imageroot/var/lib/nethserver/node/bin/check-subscription b/core/imageroot/var/lib/nethserver/node/bin/check-subscription index fed924f1b..35faf2a3f 100755 --- a/core/imageroot/var/lib/nethserver/node/bin/check-subscription +++ b/core/imageroot/var/lib/nethserver/node/bin/check-subscription @@ -43,7 +43,6 @@ function enable_nsent() # Some services must be disabled in worker nodes systemctl disable --now send-heartbeat.service send-inventory.timer send-backup.timer fi - systemctl enable --now node-monitor.service systemctl enable check-subscription.service } @@ -60,7 +59,6 @@ function enable_nscom() # Some services must be disabled in worker nodes systemctl disable --now send-heartbeat.service send-inventory.timer fi - systemctl enable --now node-monitor.service systemctl enable check-subscription.service } @@ -70,7 +68,6 
@@ elif [[ "${provider}" == "nscom" ]]; then enable_nscom else systemctl disable --now \ - node-monitor.service \ send-heartbeat.service \ send-inventory.timer \ send-backup.timer diff --git a/core/imageroot/var/lib/nethserver/node/bin/node-monitor b/core/imageroot/var/lib/nethserver/node/bin/node-monitor deleted file mode 100755 index dbd605cbe..000000000 --- a/core/imageroot/var/lib/nethserver/node/bin/node-monitor +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2024 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later -# - -import asyncio -import aiohttp -import multiprocessing -import sys -import os -import agent - -# Alarm states -UNKNOWN = 'U' -CLEAR = '0' -CRITICAL = 'F' - -nmenv = agent.read_envfile("node-monitor.env") - -# Prepare runtime parameters -node_name = 'node_' + os.environ['NODE_ID'] -auth_token = nmenv.get("NMON_ALERT_AUTH_TOKEN") -alert_provider = nmenv.get("NMON_ALERT_PROVIDER") -system_id = nmenv.get("NMON_ALERT_SYSTEM_ID") -dartagnan_url = nmenv.get("NMON_DARTAGNAN_URL") -cpu_count = multiprocessing.cpu_count() - -def parse_meminfo(): - meminfo = {} - with open("/proc/meminfo", encoding='utf-8', newline="\n") as fmem: - for line in fmem: - key, value, unit, _ = (line.rstrip("\n") + ' 0 0 0').split(maxsplit=3) - meminfo[key.rstrip(":")] = value - return meminfo - -async def send_alert(url, value, alert, retry=3): - if value == CLEAR: - astatus = 'OK' - elif value == CRITICAL: - astatus = 'FAILURE' - else: - return - ctimeout = aiohttp.ClientTimeout(total=60.0, connect=50, sock_connect=40, sock_read=10) - ex_message = None - try: - async with aiohttp.ClientSession(timeout=ctimeout, headers={'Authorization': 'token ' + auth_token}) as cs: - await cs.post(url, json={"lk":system_id, "alert_id": alert, "status": astatus}) - except Exception as ex: - ex_message = str(ex) - if ex_message: - if retry > 0: - print(agent.SD_DEBUG + "send_alert temporary failure:", value, alert, ex_message, f"-- retrying in 20 
seconds", file=sys.stderr) - await asyncio.sleep(20) - await send_alert(url, value, alert, retry - 1) - else: - print(agent.SD_ERR + "send_alert aborted:", value, alert, ex_message, file=sys.stderr) - -async def raise_alert(value, alert): - if value == CLEAR: - svalue = 'CLEAR' - elif value == CRITICAL: - svalue = 'CRITICAL' - else: - svalue = 'UNKNOWN' - if alert.startswith('load'): - print('NOTICE', svalue, alert, file=sys.stderr) - return # do not send load alarm at all, just log it - else: - print('ALERT', svalue, alert, file=sys.stderr) - if alert_provider == 'nsent': - await send_alert('https://my.nethesis.it/isa/alerts/store', value, alert) - elif alert_provider == 'nscom': - await send_alert(f'{dartagnan_url}/machine/alerts/store', value, alert) - -async def check_swap_presence(): - meminfo = parse_meminfo() - if int(meminfo['SwapTotal']) == 0: - return (CRITICAL, 'swap:notpresent:' + node_name) - else: - return (CLEAR, 'swap:notpresent:' + node_name) - -async def check_swap(): - meminfo = parse_meminfo() - if int(meminfo['SwapTotal']) == 0: - return (CLEAR, 'swap:full:' + node_name) - try: - swapfree_ratio = float(meminfo['SwapFree']) / int(meminfo['SwapTotal']) - except: - swapfree_ratio = 0.0 - if swapfree_ratio < 0.2: - state = CRITICAL - else: - state = CLEAR - return (state, 'swap:full:' + node_name) - - -async def check_mountpoints(mp_path, mp_name=None): - ostatvfs = os.statvfs(mp_path) - if not mp_name: - mp_name = mp_path.strip("/").replace("/", "_") - free_inodes_ratio = float(ostatvfs.f_ffree) / ostatvfs.f_files - free_blocks_ratio = float(ostatvfs.f_bfree) / ostatvfs.f_blocks - if free_inodes_ratio < 0.02 or free_blocks_ratio < 0.1: - state = CRITICAL - else: - state = CLEAR - return (state, f'fs:{mp_name}:full:' + node_name) - -async def check_systemload(): - with open("/proc/loadavg", encoding='utf-8', newline="\n") as fload: - fields = fload.readline().split() - # consider critical 1 minute load average greater than CPU count - if 
float(fields[0]) > cpu_count: - state = CRITICAL - else: - state = CLEAR - return (state, 'load:high:' + node_name) - -async def check_backup(): - state = UNKNOWN - try: - with open("/run/backup-monitor.dat", "rb") as fbackup: - fstate = fbackup.read(1) - if fstate == b'0': - state = CLEAR - elif fstate == b'F': - state = CRITICAL - except FileNotFoundError: - pass - return (state, 'system:backup:failure') - -async def monitor_loop(check_function, fargs=[], fkwargs={}, period=2, hysteresis=4): - # hysteresis counts how many past checks are considered to build the - # alarm state - state_buffer = [CLEAR,] * hysteresis - alarm_state = UNKNOWN # force initial transition to CLEAR - head = 0 - - def current_state(): - return state_buffer[head] - - # the alarm state changes when all past checks are equal and different - # from the last alarm state - def check_alarm(): - return alarm_state != current_state() and all(x == current_state() for x in state_buffer) - - while True: - (value, alert) = await check_function(*fargs, **fkwargs) - state_buffer[head] = value # update current state - if check_alarm(): - alarm_state = value - await raise_alert(value, alert) - await asyncio.sleep(period) - head = (head + 1) % hysteresis - -async def main(): - # Run each check inside a monitor loop in parallel: - await asyncio.gather( - monitor_loop(check_swap, period=3, hysteresis=10), # 30 seconds - monitor_loop(check_swap_presence, period=60, hysteresis=30), # 30 minutes - monitor_loop(check_systemload, period=5, hysteresis=24), # 2 minutes - monitor_loop(check_mountpoints, fargs=['/', 'root'], period=15, hysteresis=4), # 1 minute - monitor_loop(check_mountpoints, fargs=['/boot', 'boot'], period=15, hysteresis=4), # 1 minute - monitor_loop(check_backup, period=10, hysteresis=1), - ) - -asyncio.run(main()) diff --git a/core/imageroot/var/lib/nethserver/node/bin/write-node-monitor-envfile b/core/imageroot/var/lib/nethserver/node/bin/write-node-monitor-envfile deleted file mode 100755 
index 5ca8adff6..000000000 --- a/core/imageroot/var/lib/nethserver/node/bin/write-node-monitor-envfile +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2024 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later -# - -import sys -import os -import agent - -rdb = agent.redis_connect(use_replica=True) - -osubscription = rdb.hgetall('cluster/subscription') -node_name = rdb.get(f'node/{os.environ["NODE_ID"]}/ui_name') or f'node/{os.environ["NODE_ID"]}' - -if osubscription: - agent.write_envfile("node-monitor.env", { - "NMON_NODE_NAME": node_name, - "NMON_ALERT_AUTH_TOKEN": osubscription["auth_token"], - "NMON_ALERT_PROVIDER": osubscription["provider"], - "NMON_ALERT_SYSTEM_ID": osubscription["system_id"], - "NMON_DARTAGNAN_URL": osubscription.get("dartagnan_url", ""), - }) -else: - print(agent.SD_WARNING + "Could not retrieve subscription information from Redis", file=sys.stderr) - agent.write_envfile("node-monitor.env", {}) diff --git a/core/imageroot/var/lib/nethserver/node/uninstall.sh b/core/imageroot/var/lib/nethserver/node/uninstall.sh index d97642781..9a5fe1b9e 100644 --- a/core/imageroot/var/lib/nethserver/node/uninstall.sh +++ b/core/imageroot/var/lib/nethserver/node/uninstall.sh @@ -77,7 +77,7 @@ systemctl disable --now \ phonehome.timer \ rclone-webdav.service \ promtail.service \ - node-monitor.service \ + node_exporter.service \ send-heartbeat.service \ send-inventory.timer \ send-backup.timer \ diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/30restart_subscription b/core/imageroot/var/lib/nethserver/node/update-core.d/30restart_subscription index 2bbb7ec29..b5f8d149a 100755 --- a/core/imageroot/var/lib/nethserver/node/update-core.d/30restart_subscription +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/30restart_subscription @@ -9,4 +9,4 @@ exec 1>&2 set -e # Restart subscription long-running services, if enabled -systemctl try-restart node-monitor.service send-heartbeat.service +systemctl 
try-restart send-heartbeat.service diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/50start_node_exporter b/core/imageroot/var/lib/nethserver/node/update-core.d/50start_node_exporter new file mode 100755 index 000000000..bf2d8421e --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/50start_node_exporter @@ -0,0 +1,15 @@ +#!/bin/bash + +# +# Copyright (C) 2025 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +exec 1>&2 +set -e + +if systemctl is-enabled --quiet node_exporter ; then + systemctl restart node_exporter.service || : +else + systemctl enable --now node_exporter.service +fi diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/80node_monitor b/core/imageroot/var/lib/nethserver/node/update-core.d/80node_monitor new file mode 100755 index 000000000..558e7787a --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/80node_monitor @@ -0,0 +1,16 @@ +#!/bin/bash + +# +# Copyright (C) 2025 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +exec 1>&2 +set -e + +if systemctl -q is-active node-monitor.service; then + echo "Disabling node-monitor as service" + systemctl disable --now node-monitor + rm -f /etc/systemd/system/node-monitor.service + systemctl daemon-reload +fi diff --git a/docs/core/metrics.md b/docs/core/metrics.md new file mode 100644 index 000000000..64634595f --- /dev/null +++ b/docs/core/metrics.md @@ -0,0 +1,39 @@ +--- +layout: default +title: Metrics and alerting +nav_order: 16 +parent: Core +--- + +# Metrics and alerting + +The core provides a module named `metrics` that collects metrics from the nodes +and sends alerts to the Nethesis portal. The module is installed by the +`create-cluster` action. 
+ +The module includes the following services: + +- [Prometheus](https://prometheus.io/) +- [Alertmanager](https://prometheus.io/docs/alerting/alertmanager/) +- [Grafana](https://grafana.com/) +- [alert-proxy](alert-proxy/README.md) + +Key points: + +- Single instance running on the leader node +- Monitors all cluster nodes automatically +- Removed if the leader node becomes a worker +- Prometheus: port 9091 +- Alertmanager: port 9093 +- alert-proxy: port 9095 +- Grafana: port 3000 (disabled by default, enabled with Traefik route) + +Configuration: + +- Prometheus and Alertmanager configurations are recreated on Prometheus restart +- Module restarts on node addition/removal +- alert-proxy restarts on subscription change to send alerts to Nethesis portals + +If a subscription is enabled, alerts are sent to Nethesis portals by default. + +Please refer to the module [README](https://github.com/NethServer/ns8-metrics) for details on alert configuration and customization. \ No newline at end of file diff --git a/docs/core/subscription.md b/docs/core/subscription.md index 556913afb..482dd5f2c 100644 --- a/docs/core/subscription.md +++ b/docs/core/subscription.md @@ -30,7 +30,6 @@ managed by `check-subscription`: - `send-inventory` (leader only) Run at night, send the cluster inventory to the provider - `send-heartbeat` (leader only) Run every 10 minutes to signal the cluster liveness - `send-backup` (leader only) Run at night, send the cluster encrypted backup to the provider -- `node-monitor` Check the node loadavg, root/boot space, free swap and raises alarms - `apply-updates` (leader only) Run at night, apply core, modules and OS updates according to configuration in Redis key `cluster/apply_updates`. See also [core updates]({{site.baseurl}}/core/updates)