From 318be432fd1e7bbdb780edfa3f7e2c2458840646 Mon Sep 17 00:00:00 2001 From: Lukasz Sojka Date: Sun, 6 Oct 2024 15:24:30 +0200 Subject: [PATCH] improvement(results): store cycle's 'start time' To ease investigation of perf regressions, latency decorator results should contain `start time` of each cycle. Made `latency decorator` to send this information to Argus and present it in Results tab. closes: https://github.com/scylladb/argus/issues/455 --- sdcm/argus_results.py | 23 +++++++++++++++++++---- sdcm/utils/decorators.py | 6 ++++-- unit_tests/test_argus_results.py | 6 ++++-- utils/migrate_latency_results_to_argus.py | 9 +++++++-- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/sdcm/argus_results.py b/sdcm/argus_results.py index c3fc057078..80dd8ae9fa 100644 --- a/sdcm/argus_results.py +++ b/sdcm/argus_results.py @@ -11,6 +11,8 @@ # # Copyright (c) 2024 ScyllaDB import json +import time +from datetime import timezone, datetime from argus.client import ArgusClient from argus.client.generic_result import GenericResultTable, ColumnMetadata, ResultType, Status, ValidationRule @@ -41,8 +43,10 @@ class Meta: ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True), ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True), ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False), + # help jump into proper place in logs/monitoring + ColumnMetadata(name="start time", unit="", type=ResultType.TEXT), ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT), - ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT) + ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT), ] @@ -55,8 +59,10 @@ class Meta: ColumnMetadata(name="P99 write", unit="ms", type=ResultType.FLOAT, higher_is_better=False), ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True), ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False), + # help jump into proper place in logs/monitoring + ColumnMetadata(name="start time", unit="", type=ResultType.TEXT), ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT), - ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT) + ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT), ] @@ -69,8 +75,10 @@ class Meta: ColumnMetadata(name="P99 read", unit="ms", type=ResultType.FLOAT, higher_is_better=False), ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True), ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False), + # help jump into proper place in logs/monitoring + ColumnMetadata(name="start time", unit="", type=ResultType.TEXT), ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT), - ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT) + ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT), ] @@ -110,11 +118,16 @@ class Meta: } -def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict): +def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict, + start_time: float = 0): result_table = workload_to_table[workload]() result_table.name = f"{workload} - {name} - latencies" result_table.description = f"{workload} workload - {description}" operation_error_thresholds = LATENCY_ERROR_THRESHOLDS.get(name, LATENCY_ERROR_THRESHOLDS["default"]) + try: + start_time = datetime.fromtimestamp(start_time or time.time(), tz=timezone.utc).strftime('%H:%M:%S') + except ValueError: + start_time = "N/A" for operation in ["write", "read"]: summary = result["hdr_summary"] if operation.upper() not in summary: @@ -147,6 +160,8 @@ def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, de value=qa_screenshot, status=Status.UNSET) except IndexError: pass + result_table.add_result(column="start time", row=f"Cycle #{cycle}", + value=start_time, status=Status.UNSET) argus_client.submit_results(result_table) for event in result["reactor_stalls_stats"]: # each stall event has own table event_name = event.split(".")[-1] diff --git a/sdcm/utils/decorators.py b/sdcm/utils/decorators.py index 6dcf61a526..f1c327e870 100644 --- a/sdcm/utils/decorators.py +++ b/sdcm/utils/decorators.py @@ -264,7 +264,8 @@ def wrapped(*args, **kwargs): # noqa: PLR0914 name="Steady State", description="Latencies without any operation running", cycle=0, - result=result + result=result, + start_time=start, ) else: latency_results[func_name]['cycles'].append(result) @@ -274,7 +275,8 @@ def wrapped(*args, **kwargs): # noqa: PLR0914 name=f"{func_name}", description=legend or "", cycle=len(latency_results[func_name]['cycles']), - result=result + result=result, + start_time=start, ) with open(latency_results_file_path, 'w', encoding="utf-8") as file: diff --git a/unit_tests/test_argus_results.py b/unit_tests/test_argus_results.py index a364384c9b..4a16306f41 100644 --- a/unit_tests/test_argus_results.py +++ b/unit_tests/test_argus_results.py @@ -30,7 +30,8 @@ def test_send_latency_decorator_result_to_argus(): name="test", description="test", cycle=1, - result=result + result=result, + start_time=1721564063.4528425 ) expected_calls = [ call(LatencyCalculatorMixedResult( @@ -50,7 +51,8 @@ def test_send_latency_decorator_result_to_argus(): value='https://cloudius-jenkins-test.s3.amazonaws.com/a9b9a308-6ff8-4cc8-b33d-c439f75c9949/20240721_125838/' 'grafana-screenshot-scylla-master-perf-regression-latency-650gb-grow-shrink-scylla-per-server-metrics-nemesis' '-20240721_125845-perf-latency-grow-shrink-ubuntu-monitor-node-a9b9a308-1.png', - status=Status.UNSET) + status=Status.UNSET), + Cell(column='start time', row='Cycle #1', value='12:14:23', status=Status.UNSET) ] )), call(ReactorStallStatsResult( diff --git a/utils/migrate_latency_results_to_argus.py b/utils/migrate_latency_results_to_argus.py index 92a65bf1d9..3d9d843b32 100755 --- a/utils/migrate_latency_results_to_argus.py +++ b/utils/migrate_latency_results_to_argus.py @@ -81,13 +81,17 @@ def sort_func(item): print(f"Would send {operation} - {workload} - latencies - cycle 0 to Argus") continue try: + start_time = result["hdr_summary"].get("READ", result["hdr_summary"].get("WRITE"))[ + "start_time"] / 1000 send_result_to_argus(argus_client=client, workload=workload, name=operation, - description=description, cycle=0, result=result) + description=description, cycle=0, result=result, start_time=start_time) except argus.client.base.ArgusClientError: print( f"Failed to send {operation} - {workload} - latencies to Argus: {hit['_source']['test_details']['job_url']}") continue for idx, cycle in enumerate(latency_during_ops[operation]["cycles"], start=1): + start_time = cycle["hdr_summary"].get("READ", cycle["hdr_summary"].get("WRITE"))["start_time"] + start_time = start_time if start_time < 1000000000000 else start_time / 1000 if dry_run: print(f"Would send {operation} - {workload} - latencies - cycle {idx} to Argus") continue @@ -98,7 +102,8 @@ def sort_func(item): name=operation, description=latency_during_ops[operation]["legend"] or "", cycle=idx, - result=cycle + result=cycle, + start_time=start_time ) except argus.client.base.ArgusClientError: print(