Skip to content

Commit

Permalink
improvement(results): store cycle's 'start time'
Browse files Browse the repository at this point in the history
To ease investigation of perf regressions, latency decorator results
should contain the `start time` of each cycle.

Made the `latency decorator` send this information to Argus and present
it in the Results tab.

closes: scylladb/argus#455
  • Loading branch information
soyacz authored and fruch committed Oct 7, 2024
1 parent d3b56f6 commit 318be43
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 10 deletions.
23 changes: 19 additions & 4 deletions sdcm/argus_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#
# Copyright (c) 2024 ScyllaDB
import json
import time
from datetime import timezone, datetime

from argus.client import ArgusClient
from argus.client.generic_result import GenericResultTable, ColumnMetadata, ResultType, Status, ValidationRule
Expand Down Expand Up @@ -41,8 +43,10 @@ class Meta:
ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
# help jump into proper place in logs/monitoring
ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
]


Expand All @@ -55,8 +59,10 @@ class Meta:
ColumnMetadata(name="P99 write", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
ColumnMetadata(name="Throughput write", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
# help jump into proper place in logs/monitoring
ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
]


Expand All @@ -69,8 +75,10 @@ class Meta:
ColumnMetadata(name="P99 read", unit="ms", type=ResultType.FLOAT, higher_is_better=False),
ColumnMetadata(name="Throughput read", unit="op/s", type=ResultType.INTEGER, higher_is_better=True),
ColumnMetadata(name="duration", unit="HH:MM:SS", type=ResultType.DURATION, higher_is_better=False),
# help jump into proper place in logs/monitoring
ColumnMetadata(name="start time", unit="", type=ResultType.TEXT),
ColumnMetadata(name="Overview", unit="", type=ResultType.TEXT),
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT)
ColumnMetadata(name="QA dashboard", unit="", type=ResultType.TEXT),
]


Expand Down Expand Up @@ -110,11 +118,16 @@ class Meta:
}


def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict):
def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict,
start_time: float = 0):
result_table = workload_to_table[workload]()
result_table.name = f"{workload} - {name} - latencies"
result_table.description = f"{workload} workload - {description}"
operation_error_thresholds = LATENCY_ERROR_THRESHOLDS.get(name, LATENCY_ERROR_THRESHOLDS["default"])
try:
start_time = datetime.fromtimestamp(start_time or time.time(), tz=timezone.utc).strftime('%H:%M:%S')
except ValueError:
start_time = "N/A"
for operation in ["write", "read"]:
summary = result["hdr_summary"]
if operation.upper() not in summary:
Expand Down Expand Up @@ -147,6 +160,8 @@ def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, de
value=qa_screenshot, status=Status.UNSET)
except IndexError:
pass
result_table.add_result(column="start time", row=f"Cycle #{cycle}",
value=start_time, status=Status.UNSET)
argus_client.submit_results(result_table)
for event in result["reactor_stalls_stats"]: # each stall event has own table
event_name = event.split(".")[-1]
Expand Down
6 changes: 4 additions & 2 deletions sdcm/utils/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ def wrapped(*args, **kwargs): # noqa: PLR0914
name="Steady State",
description="Latencies without any operation running",
cycle=0,
result=result
result=result,
start_time=start,
)
else:
latency_results[func_name]['cycles'].append(result)
Expand All @@ -274,7 +275,8 @@ def wrapped(*args, **kwargs): # noqa: PLR0914
name=f"{func_name}",
description=legend or "",
cycle=len(latency_results[func_name]['cycles']),
result=result
result=result,
start_time=start,
)

with open(latency_results_file_path, 'w', encoding="utf-8") as file:
Expand Down
6 changes: 4 additions & 2 deletions unit_tests/test_argus_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def test_send_latency_decorator_result_to_argus():
name="test",
description="test",
cycle=1,
result=result
result=result,
start_time=1721564063.4528425
)
expected_calls = [
call(LatencyCalculatorMixedResult(
Expand All @@ -50,7 +51,8 @@ def test_send_latency_decorator_result_to_argus():
value='https://cloudius-jenkins-test.s3.amazonaws.com/a9b9a308-6ff8-4cc8-b33d-c439f75c9949/20240721_125838/'
'grafana-screenshot-scylla-master-perf-regression-latency-650gb-grow-shrink-scylla-per-server-metrics-nemesis'
'-20240721_125845-perf-latency-grow-shrink-ubuntu-monitor-node-a9b9a308-1.png',
status=Status.UNSET)
status=Status.UNSET),
Cell(column='start time', row='Cycle #1', value='12:14:23', status=Status.UNSET)
]
)),
call(ReactorStallStatsResult(
Expand Down
9 changes: 7 additions & 2 deletions utils/migrate_latency_results_to_argus.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,17 @@ def sort_func(item):
print(f"Would send {operation} - {workload} - latencies - cycle 0 to Argus")
continue
try:
start_time = result["hdr_summary"].get("READ", result["hdr_summary"].get("WRITE"))[
"start_time"] / 1000
send_result_to_argus(argus_client=client, workload=workload, name=operation,
description=description, cycle=0, result=result)
description=description, cycle=0, result=result, start_time=start_time)
except argus.client.base.ArgusClientError:
print(
f"Failed to send {operation} - {workload} - latencies to Argus: {hit['_source']['test_details']['job_url']}")
continue
for idx, cycle in enumerate(latency_during_ops[operation]["cycles"], start=1):
start_time = cycle["hdr_summary"].get("READ", cycle["hdr_summary"].get("WRITE"))["start_time"]
start_time = start_time if start_time < 1000000000000 else start_time / 1000
if dry_run:
print(f"Would send {operation} - {workload} - latencies - cycle {idx} to Argus")
continue
Expand All @@ -98,7 +102,8 @@ def sort_func(item):
name=operation,
description=latency_during_ops[operation]["legend"] or "",
cycle=idx,
result=cycle
result=cycle,
start_time=start_time
)
except argus.client.base.ArgusClientError:
print(
Expand Down

0 comments on commit 318be43

Please sign in to comment.