From a057ff23d7ddfa71c75ad229ade6e80c38079682 Mon Sep 17 00:00:00 2001
From: Liapkovich
Date: Tue, 12 Nov 2024 15:04:08 +0100
Subject: [PATCH] feature(manager): add restore bandwidth characteristics to Argus results

Restore metrics (download and load&stream bandwidths) were introduced
in Manager 3.4.
Including these characteristics in restore benchmark Argus graphs
significantly improves their informativeness.
---
 mgmt_cli_test.py      |  5 +++++
 sdcm/argus_results.py |  6 +++++-
 sdcm/mgmt/cli.py      | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/mgmt_cli_test.py b/mgmt_cli_test.py
index cc1fa5f25a..6877b28df4 100644
--- a/mgmt_cli_test.py
+++ b/mgmt_cli_test.py
@@ -1373,6 +1373,11 @@ def _send_restore_results_to_argus(self, task: RestoreTask, manager_version_time
             "repair time": repair_time,
             "total": total_restore_time,
         }
+        download_bw, load_and_stream_bw = task.download_bw, task.load_and_stream_bw
+        if download_bw:
+            results["download bandwidth"] = download_bw
+        if load_and_stream_bw:
+            results["l&s bandwidth"] = load_and_stream_bw
         send_manager_benchmark_results_to_argus(
             argus_client=self.test_config.argus_client(),
             result=results,
diff --git a/sdcm/argus_results.py b/sdcm/argus_results.py
index a2d3f4f15f..444d2016bf 100644
--- a/sdcm/argus_results.py
+++ b/sdcm/argus_results.py
@@ -103,13 +103,17 @@ class Meta:
         description = "Restore benchmark"
         Columns = [
             ColumnMetadata(name="restore time", unit="s", type=ResultType.DURATION, higher_is_better=False),
+            ColumnMetadata(name="download bandwidth", unit="MiB/s/shard", type=ResultType.FLOAT, higher_is_better=True),
+            ColumnMetadata(name="l&s bandwidth", unit="MiB/s/shard", type=ResultType.FLOAT, higher_is_better=True),
             ColumnMetadata(name="repair time", unit="s", type=ResultType.DURATION, higher_is_better=False),
             ColumnMetadata(name="total", unit="s", type=ResultType.DURATION, higher_is_better=False),
         ]
         ValidationRules = {
             "restore time": ValidationRule(best_pct=10),
+            "download bandwidth": ValidationRule(best_pct=10),
+            "l&s bandwidth": ValidationRule(best_pct=10),
             "repair time": ValidationRule(best_pct=10),
-            "total": ValidationRule(best_pct=10)
+            "total": ValidationRule(best_pct=10),
         }
 
 
diff --git a/sdcm/mgmt/cli.py b/sdcm/mgmt/cli.py
index 973e096b88..fd3c7248e9 100644
--- a/sdcm/mgmt/cli.py
+++ b/sdcm/mgmt/cli.py
@@ -16,6 +16,7 @@
 import time
 import logging
 import datetime
+import re
 from pathlib import Path
 from re import findall
 from textwrap import dedent
@@ -506,6 +507,41 @@ class RestoreTask(ManagerTask):
     def __init__(self, task_id, cluster_id, manager_node):
         ManagerTask.__init__(self, task_id=task_id, cluster_id=cluster_id, manager_node=manager_node)
 
+    def _bandwidth(self, offset: int, phase: str) -> float | None:
+        """Parse a bandwidth value (MiB/s/shard) from the sctool restore progress output.
+
+        `offset` is the row position relative to the 'Bandwidth:' header row:
+        ...
+        Bandwidth:
+          - Download:    22.313MiB/s/shard   (offset 1)
+          - Load&stream: 3.556MiB/s/shard    (offset 2)
+        ...
+        Returns None (and logs a warning naming `phase`) when the value is missing or unparsable.
+        """
+        res = self.progress_string()
+        try:
+            # ValueError: no 'Bandwidth:' row; IndexError: section is truncated or the row is empty
+            bandwidth_str = res[res.index(['Bandwidth:']) + offset][0]
+        except (ValueError, IndexError):
+            bandwidth_str = ""
+        # Fraction is optional so that whole numbers (e.g. '3MiB/s/shard') are parsed as well
+        bandwidth_match = re.search(r"(\d+(?:\.\d+)?)", bandwidth_str)
+        if bandwidth_match is None:
+            LOGGER.warning("Failed to extract %s bandwidth from the sctool restore progress output. "
+                           "Check Manager version, bandwidth metrics are supported starting from 3.4.", phase)
+            return None
+        return float(bandwidth_match.group(1))
+
+    @property
+    def download_bw(self) -> float | None:
+        """Restore download phase bandwidth in MiB/s/shard, None if it could not be extracted"""
+        return self._bandwidth(offset=1, phase="Download")
+
+    @property
+    def load_and_stream_bw(self) -> float | None:
+        """Restore load&stream phase bandwidth in MiB/s/shard, None if it could not be extracted"""
+        return self._bandwidth(offset=2, phase="Load&Stream")
+
     @property
     def post_restore_repair_duration(self) -> datetime.timedelta:
         """Restore task consists of two parts and includes two duration marks: