Skip to content

Commit

Permalink
improvement(monitoring): restore monitor with test time range
Browse files Browse the repository at this point in the history
Restoring monitor data with `hydra investigate show-monitor` sets the
default time range to the most recent 30m. This is inconvenient, as it
requires finding the test-related time range manually.

This change hardcodes the time range in the dashboard defaults to the range
of the test duration.

closes: #5045
closes: scylladb/argus#454
  • Loading branch information
soyacz authored and fruch committed Oct 10, 2024
1 parent 68a4a7b commit 131c334
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
25 changes: 24 additions & 1 deletion sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import ipaddress
from importlib import import_module
from typing import List, Optional, Dict, Union, Set, Iterable, ContextManager, Any, IO, AnyStr, Callable
from datetime import datetime
from datetime import datetime, timezone
from textwrap import dedent
from functools import cached_property, wraps, lru_cache, partial
from collections import defaultdict
Expand Down Expand Up @@ -5854,6 +5854,8 @@ def start_scylla_monitoring(self, node):
self.save_sct_dashboards_config(node)
self.save_monitoring_version(node)
Path(self.sct_dashboard_json_file).unlink(missing_ok=True)
end_time = self.params["test_duration"] * 60 + time.time() + 120 * 60 # 2h margin
self.update_default_time_range(time.time(), end_time)

def save_monitoring_version(self, node):
node.remoter.run(
Expand All @@ -5879,6 +5881,24 @@ def save_sct_dashboards_config(self, node):
node.remoter.run('mkdir -p {}'.format(sct_monitoring_addons_dir), ignore_status=True)
node.remoter.send_files(src=self.sct_dashboard_json_file, dst=sct_monitoring_addons_dir)

def update_default_time_range(self, start_timestamp: float, end_timestamp: float) -> None:
    """
    Pin the default Grafana time range of every dashboard to a fixed window.

    A single `find ... -exec sed -i` command is built that rewrites the "from" and "to"
    values in every `*.json` file found under `<monitor_install_path>/grafana/build/`,
    and that command is then executed on each node in `self.nodes`.

    Both timestamps are epoch seconds; they are rendered as UTC strings in the
    `YYYY-MM-DDTHH:MM:SSZ` form before being substituted into the dashboards.

    This is best effort: a failure on one node is logged and the remaining nodes
    are still processed; nothing is raised to the caller.
    """
    def as_utc_iso(epoch_seconds: float) -> str:
        # Timezone-aware conversion so the trailing "Z" in the output is accurate.
        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

    range_start, range_end = as_utc_iso(start_timestamp), as_utc_iso(end_timestamp)
    dashboards_dir = os.path.join(self.monitor_install_path, 'grafana/build/')
    # Two sed substitutions: one for the "from" value and one for the "to" value.
    sed_script = (f"s/\"from\": \"[^\"]*\"/\"from\": \"{range_start}\"/; "
                  f"s/\"to\": \"[^\"]*\"/\"to\": \"{range_end}\"/")
    cmd = f"find {dashboards_dir} -name '*.json' -exec sed -i '{sed_script}' {{}} +"
    for node in self.nodes:
        try:
            node.remoter.run(cmd)
        except Exception:
            LOGGER.error(f"Failed to update time range for Grafana dashboards on {node}", exc_info=True)

@log_run_info
def install_scylla_monitoring(self, node):
self.install_scylla_monitoring_prereqs(node)
Expand Down Expand Up @@ -6006,6 +6026,9 @@ def collect_logs(self, storage_dir):
def get_grafana_screenshots_from_all_monitors(self, test_start_time=None):  # pylint: disable=unused-argument,no-self-use,invalid-name
    """Return an empty list; `test_start_time` is accepted but not used by this implementation."""
    return []

def update_default_time_range(self, start_timestamp: float, end_timestamp: float) -> None:
    """Do nothing: this implementation accepts the time range arguments and ignores them."""


class LocalNode(BaseNode):
def __init__(self, name, parent_cluster, # pylint: disable=too-many-arguments,unused-argument
Expand Down
3 changes: 1 addition & 2 deletions sdcm/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2972,6 +2972,7 @@ def tearDown(self):
if self.kafka_cluster:
with silence(parent=self, name='stopping kafka'):
self.kafka_cluster.stop()
self.monitors.update_default_time_range(self.start_time, time.time())
if self.params.get('collect_logs'):
self.collect_logs()
self.clean_resources()
Expand All @@ -2982,8 +2983,6 @@ def tearDown(self):
self.argus_collect_gemini_results()
self.destroy_localhost()
self.stop_event_device()
if self.params.get('collect_logs'):
self.collect_sct_logs()
with silence(parent=self, name='Cleaning up SSL config directory'):
cleanup_ssl_config()

Expand Down
3 changes: 3 additions & 0 deletions unit_tests/test_tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import time
import unittest.mock
from time import sleep
from unittest.mock import MagicMock

from sdcm.sct_events import Severity
from sdcm.sct_events.health import ClusterHealthValidatorEvent
Expand Down Expand Up @@ -97,6 +98,7 @@ def argus_collect_manager_version(self):
pass

def tearDown(self):
self.monitors = MagicMock()
super().tearDown()
self._validate_results()
self.events_processes_registry_patcher.stop()
Expand Down Expand Up @@ -273,6 +275,7 @@ def test(self):
pass

def tearDown(self):
    """Install a stand-in `monitors` object, then run `ClusterTester.tearDown`."""
    # `monitors` is replaced with a MagicMock before delegating, so any call made on
    # `self.monitors` during tear-down is absorbed by the mock.
    self.monitors = MagicMock()
    ClusterTester.tearDown(self)

def _validate_results(self):
Expand Down

0 comments on commit 131c334

Please sign in to comment.