From 318be432fd1e7bbdb780edfa3f7e2c2458840646 Mon Sep 17 00:00:00 2001
From: Lukasz Sojka <lukasz.sojka@scylladb.com>
Date: Sun, 6 Oct 2024 15:24:30 +0200
Subject: [PATCH] improvement(results): store cycle's 'start time'

To ease investigation of perf regressions, latency decorator results
should contain `start time` of each cycle.

Made `latency decorator` to send this information to Argus and present
it in Results tab.

closes: https://github.com/scylladb/argus/issues/455
---
 sdcm/argus_results.py                     | 23 +++++++++++++++++++----
 sdcm/utils/decorators.py                  |  6 ++++--
 unit_tests/test_argus_results.py          |  6 ++++--
 utils/migrate_latency_results_to_argus.py |  9 +++++++--
 4 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/sdcm/argus_results.py b/sdcm/argus_results.py
index c3fc057078..80dd8ae9fa 100644
--- a/sdcm/argus_results.py
+++ b/sdcm/argus_results.py
@@ -11,6 +11,8 @@
 #
 # Copyright (c) 2024 ScyllaDB
 import json
+import time
+from datetime import timezone, datetime
 
 from argus.client import ArgusClient
 from argus.client.generic_result import GenericResultTable, ColumnMetadata, ResultType, Status, ValidationRule
@@ -41,8 +43,10 @@ class Meta:
             ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
             ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
             ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
+            # help jump into proper place in logs/monitoring
+            ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
             ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
-            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
+            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
         ]
 
 
@@ -55,8 +59,10 @@ class Meta:
             ColumnMetadata(name="P99 write", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
             ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
             ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
+            # help jump into proper place in logs/monitoring
+            ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
             ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
-            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
+            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
         ]
 
 
@@ -69,8 +75,10 @@ class Meta:
             ColumnMetadata(name="P99 read", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
             ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
             ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
+            # help jump into proper place in logs/monitoring
+            ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
             ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
-            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
+            ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
         ]
 
 
@@ -110,11 +118,16 @@ class Meta:
 }
 
 
-def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict):
+def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict,
+                         start_time: float = 0):
     result_table = workload_to_table[workload]()
     result_table.name = f"{workload} - {name} - latencies"
     result_table.description = f"{workload} workload - {description}"
     operation_error_thresholds = LATENCY_ERROR_THRESHOLDS.get(name, LATENCY_ERROR_THRESHOLDS["default"])
+    try:
+        start_time = datetime.fromtimestamp(start_time or time.time(), tz=timezone.utc).strftime('%H:%M:%S')
+    except ValueError:
+        start_time = "N/A"
     for operation in ["write", "read"]:
         summary = result["hdr_summary"]
         if operation.upper() not in summary:
@@ -147,6 +160,8 @@ def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, de
                                 value=qa_screenshot, status=Status.UNSET)
     except IndexError:
         pass
+    result_table.add_result(column="start time", row=f"Cycle #{cycle}",
+                            value=start_time, status=Status.UNSET)
     argus_client.submit_results(result_table)
     for event in result["reactor_stalls_stats"]:  # each stall event has own table
         event_name = event.split(".")[-1]
diff --git a/sdcm/utils/decorators.py b/sdcm/utils/decorators.py
index 6dcf61a526..f1c327e870 100644
--- a/sdcm/utils/decorators.py
+++ b/sdcm/utils/decorators.py
@@ -264,7 +264,8 @@ def wrapped(*args, **kwargs):  # noqa: PLR0914
                         name="Steady State",
                         description="Latencies without any operation running",
                         cycle=0,
-                        result=result
+                        result=result,
+                        start_time=start,
                     )
             else:
                 latency_results[func_name]['cycles'].append(result)
@@ -274,7 +275,8 @@ def wrapped(*args, **kwargs):  # noqa: PLR0914
                     name=f"{func_name}",
                     description=legend or "",
                     cycle=len(latency_results[func_name]['cycles']),
-                    result=result
+                    result=result,
+                    start_time=start,
                 )
 
             with open(latency_results_file_path, 'w', encoding="utf-8") as file:
diff --git a/unit_tests/test_argus_results.py b/unit_tests/test_argus_results.py
index a364384c9b..4a16306f41 100644
--- a/unit_tests/test_argus_results.py
+++ b/unit_tests/test_argus_results.py
@@ -30,7 +30,8 @@ def test_send_latency_decorator_result_to_argus():
         name="test",
         description="test",
         cycle=1,
-        result=result
+        result=result,
+        start_time=1721564063.4528425
     )
     expected_calls = [
         call(LatencyCalculatorMixedResult(
@@ -50,7 +51,8 @@ def test_send_latency_decorator_result_to_argus():
                      value='https://cloudius-jenkins-test.s3.amazonaws.com/a9b9a308-6ff8-4cc8-b33d-c439f75c9949/20240721_125838/'
                            'grafana-screenshot-scylla-master-perf-regression-latency-650gb-grow-shrink-scylla-per-server-metrics-nemesis'
                            '-20240721_125845-perf-latency-grow-shrink-ubuntu-monitor-node-a9b9a308-1.png',
-                     status=Status.UNSET)
+                     status=Status.UNSET),
+                Cell(column='start time', row='Cycle #1', value='12:14:23', status=Status.UNSET)
             ]
         )),
         call(ReactorStallStatsResult(
diff --git a/utils/migrate_latency_results_to_argus.py b/utils/migrate_latency_results_to_argus.py
index 92a65bf1d9..3d9d843b32 100755
--- a/utils/migrate_latency_results_to_argus.py
+++ b/utils/migrate_latency_results_to_argus.py
@@ -81,13 +81,17 @@ def sort_func(item):
                     print(f"Would send {operation} - {workload} - latencies - cycle 0 to Argus")
                     continue
                 try:
+                    start_time = result["hdr_summary"].get("READ", result["hdr_summary"].get("WRITE"))[
+                        "start_time"] / 1000
                     send_result_to_argus(argus_client=client, workload=workload, name=operation,
-                                         description=description, cycle=0, result=result)
+                                         description=description, cycle=0, result=result, start_time=start_time)
                 except argus.client.base.ArgusClientError:
                     print(
                         f"Failed to send {operation} - {workload} - latencies to Argus: {hit['_source']['test_details']['job_url']}")
                 continue
             for idx, cycle in enumerate(latency_during_ops[operation]["cycles"], start=1):
+                start_time = cycle["hdr_summary"].get("READ", cycle["hdr_summary"].get("WRITE"))["start_time"]
+                start_time = start_time if start_time < 1000000000000 else start_time / 1000
                 if dry_run:
                     print(f"Would send {operation} - {workload} - latencies - cycle {idx} to Argus")
                     continue
@@ -98,7 +102,8 @@ def sort_func(item):
                         name=operation,
                         description=latency_during_ops[operation]["legend"] or "",
                         cycle=idx,
-                        result=cycle
+                        result=cycle,
+                        start_time=start_time
                     )
                 except argus.client.base.ArgusClientError:
                     print(