From 131c334f9145b9e51fabd59c49d80ea7c92da702 Mon Sep 17 00:00:00 2001 From: Lukasz Sojka Date: Sun, 6 Oct 2024 15:49:24 +0200 Subject: [PATCH] improvement(monitoring): restore monitor with test time range Restoring monitor data with `hydra investigate show-monitor` sets the default time range to the most recent 30m. This is inconvenient, as it requires finding the test-related time range manually. This change hardcodes the time range in dashboard defaults to the range of the test duration. closes: https://github.com/scylladb/scylla-cluster-tests/issues/5045 closes: https://github.com/scylladb/argus/issues/454 --- sdcm/cluster.py | 25 ++++++++++++++++++++++++- sdcm/tester.py | 3 +-- unit_tests/test_tester.py | 3 +++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/sdcm/cluster.py b/sdcm/cluster.py index 6383e88c2f..78517f0272 100644 --- a/sdcm/cluster.py +++ b/sdcm/cluster.py @@ -33,7 +33,7 @@ import ipaddress from importlib import import_module from typing import List, Optional, Dict, Union, Set, Iterable, ContextManager, Any, IO, AnyStr, Callable -from datetime import datetime +from datetime import datetime, timezone from textwrap import dedent from functools import cached_property, wraps, lru_cache, partial from collections import defaultdict @@ -5854,6 +5854,8 @@ def start_scylla_monitoring(self, node): self.save_sct_dashboards_config(node) self.save_monitoring_version(node) Path(self.sct_dashboard_json_file).unlink(missing_ok=True) + end_time = self.params["test_duration"] * 60 + time.time() + 120 * 60 # 2h margin + self.update_default_time_range(time.time(), end_time) def save_monitoring_version(self, node): node.remoter.run( @@ -5879,6 +5881,24 @@ def save_sct_dashboards_config(self, node): node.remoter.run('mkdir -p {}'.format(sct_monitoring_addons_dir), ignore_status=True) node.remoter.send_files(src=self.sct_dashboard_json_file, dst=sct_monitoring_addons_dir) + def update_default_time_range(self, start_timestamp: float, end_timestamp: float) -> None: + """ + Specify the 
Grafana time range for all dashboards by updating the JSON files. + + This method will find all JSON files in the `grafana/build/` directories and replace + the "from" and "to" time range values with the provided start and end timestamps. + """ + start_iso = datetime.fromtimestamp(start_timestamp, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + end_iso = datetime.fromtimestamp(end_timestamp, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + cmd = (f"find {os.path.join(self.monitor_install_path, 'grafana/build/')} -name '*.json' -exec " + f"sed -i 's/\"from\": \"[^\"]*\"/\"from\": \"{start_iso}\"/; " + f"s/\"to\": \"[^\"]*\"/\"to\": \"{end_iso}\"/' {{}} +") + for node in self.nodes: + try: + node.remoter.run(cmd) + except Exception: + LOGGER.error(f"Failed to update time range for Grafana dashboards on {node}", exc_info=True) + @log_run_info def install_scylla_monitoring(self, node): self.install_scylla_monitoring_prereqs(node) @@ -6006,6 +6026,9 @@ def collect_logs(self, storage_dir): def get_grafana_screenshots_from_all_monitors(self, test_start_time=None): # pylint: disable=unused-argument,no-self-use,invalid-name return [] + def update_default_time_range(self, start_timestamp: float, end_timestamp: float) -> None: + pass + class LocalNode(BaseNode): def __init__(self, name, parent_cluster, # pylint: disable=too-many-arguments,unused-argument diff --git a/sdcm/tester.py b/sdcm/tester.py index 0372813ff3..c0816f7337 100644 --- a/sdcm/tester.py +++ b/sdcm/tester.py @@ -2972,6 +2972,7 @@ def tearDown(self): if self.kafka_cluster: with silence(parent=self, name='stopping kafka'): self.kafka_cluster.stop() + self.monitors.update_default_time_range(self.start_time, time.time()) if self.params.get('collect_logs'): self.collect_logs() self.clean_resources() @@ -2982,8 +2983,6 @@ def tearDown(self): self.argus_collect_gemini_results() self.destroy_localhost() self.stop_event_device() - if self.params.get('collect_logs'): - self.collect_sct_logs() with 
silence(parent=self, name='Cleaning up SSL config directory'): cleanup_ssl_config() diff --git a/unit_tests/test_tester.py b/unit_tests/test_tester.py index 8a14e00ddf..cdccd0da4a 100644 --- a/unit_tests/test_tester.py +++ b/unit_tests/test_tester.py @@ -18,6 +18,7 @@ import time import unittest.mock from time import sleep +from unittest.mock import MagicMock from sdcm.sct_events import Severity from sdcm.sct_events.health import ClusterHealthValidatorEvent @@ -97,6 +98,7 @@ def argus_collect_manager_version(self): pass def tearDown(self): + self.monitors = MagicMock() super().tearDown() self._validate_results() self.events_processes_registry_patcher.stop() @@ -273,6 +275,7 @@ def test(self): pass def tearDown(self): + self.monitors = MagicMock() ClusterTester.tearDown(self) def _validate_results(self):