-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SARC-330] Implémenter les alertes : Proportion de jobs GPU avec stat…
…s prometheus spécifique aux GPUs sur un noeud donné plus bas qu’un threshold X
- Loading branch information
1 parent
bab50a5
commit 273ef0e
Showing
11 changed files
with
223 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import functools | ||
import re | ||
|
||
import pytest | ||
|
||
from sarc.alerts.usage_alerts.prometheus_stats_occurrences import ( | ||
check_prometheus_stats_for_gpu_jobs, | ||
) | ||
from sarc.client import get_jobs | ||
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME | ||
|
||
from ..jobs.test_func_job_statistics import generate_fake_timeseries | ||
|
||
PARAMS = { | ||
# Check with default params. In last 7 days from now (mock time: 2023-11-22), | ||
# there is only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats. | ||
# So threshold will be 0 everywhere, and no warning will be printed. | ||
"default": dict(), | ||
# Check with no time_interval. | ||
"no_time_interval": dict(time_interval=None), | ||
# Check with no time_interval and low amount of stddev (0.25). | ||
"std_025": dict(time_interval=None, nb_stddev=0.25), | ||
# Check with no time_interval, 0.25 stddev, and 1 extra cluster. | ||
# Expected 1 more warning, no other changes . | ||
"std_025_clusters_extra": dict( | ||
time_interval=None, | ||
nb_stddev=0.25, | ||
cluster_names=[ | ||
"raisin", | ||
"patate", | ||
"fromage", | ||
"mila", | ||
"invisible-cluster", | ||
], | ||
), | ||
# Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change. | ||
"std_025_clusters_2": dict( | ||
time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"] | ||
), | ||
# Check with no time_interval, 0.25 stddev, and no group_by_node. | ||
"std_025_group_none": dict( | ||
time_interval=None, nb_stddev=0.25, group_by_node=() | ||
), | ||
# Check with no time_interval, 0.25 stddev, and group_by_node for all clusters. | ||
# Sams as if group_by_node is not specified, as only `raisin` triggers some warnings. | ||
"std_025_group_full": dict( | ||
time_interval=None, | ||
nb_stddev=0.25, | ||
group_by_node=["raisin", "patate", "fromage", "mila"], | ||
), | ||
# Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs to 2. | ||
"std_025_group_full_min_jobs_2": dict( | ||
time_interval=None, | ||
nb_stddev=0.25, | ||
group_by_node=["raisin", "patate", "fromage", "mila"], | ||
min_jobs_per_group=2, | ||
), | ||
# Check with no time_interval, 0.25 stddev, group_by_node for all clusters, | ||
# and min jobs set to 2 for only `raisin`. | ||
# No warning, since timestamp when `raisin` triggers warnings has only 2 jobs on this cluster. | ||
"std_025_group_full_min_jobs_raisin": dict( | ||
time_interval=None, | ||
nb_stddev=0.25, | ||
group_by_node=["raisin", "patate", "fromage", "mila"], | ||
min_jobs_per_group={"raisin": 3}, | ||
), | ||
} | ||
|
||
|
||
@pytest.mark.freeze_time(MOCK_TIME) | ||
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") | ||
@pytest.mark.parametrize( | ||
"params", | ||
PARAMS.values(), | ||
ids=PARAMS.keys() | ||
) | ||
def test_check_prometheus_stats_for_gpu_jobs( | ||
params, monkeypatch, caplog, file_regression | ||
): | ||
monkeypatch.setattr( | ||
"sarc.jobs.series.get_job_time_series", generate_fake_timeseries | ||
) | ||
|
||
for job in get_jobs(): | ||
job.statistics(save=True) | ||
check_prometheus_stats_for_gpu_jobs(**params) | ||
file_regression.check( | ||
re.sub( | ||
r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +", | ||
"", | ||
caplog.text, | ||
) | ||
) |
Empty file.
7 changes: 7 additions & 0 deletions
7
...metheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 |
7 changes: 7 additions & 0 deletions
7
.../test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 |
7 changes: 7 additions & 0 deletions
7
...theus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 |
8 changes: 8 additions & 0 deletions
8
...s_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[invisible-cluster] no Prometheus data available: no job found |
7 changes: 7 additions & 0 deletions
7
...theus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 | ||
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 |
Oops, something went wrong.