diff --git a/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py b/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py index 5ebf4c41..7acdcbb1 100644 --- a/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py +++ b/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py @@ -20,14 +20,17 @@ def __init__(self, name): self.threshold = None +# pylint: disable=too-many-branches def check_prometheus_stats_occurrences( time_interval: Optional[timedelta] = timedelta(days=7), time_unit=timedelta(days=1), minimum_runtime: Optional[timedelta] = timedelta(minutes=5), cluster_names: Optional[List[str]] = None, - group_by_node: Optional[Sequence[str]] = ("mila",), + group_by_node: Union[bool, Sequence[str]] = ("mila",), min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None, nb_stddev=2, + with_gres_gpu=False, + prometheus_stats=("cpu_utilization", "system_memory"), ): """ Check if we have scrapped Prometheus stats for enough jobs per node per cluster per time unit. @@ -56,8 +59,10 @@ def check_prometheus_stats_occurrences( If a cluster in this list does not appear in jobs, a warning will be logged. If empty (or not specified), use all clusters available among jobs retrieved with time_interval. - group_by_node: Sequence - Optional sequence of clusters to group by node. + group_by_node: Sequence | bool + Either a sequence of clusters to group by node, + or False to indicate no cluster to group by node (equivalent to empty sequence), + or True to indicate that all clusters must be grouped by node. For clusters in this list, we will check each node separately (ie. a "group" is a cluster node). By default, we check the entire cluster (i.e. the "group" is the cluster itself). min_jobs_per_group: int | dict @@ -71,6 +76,11 @@ def check_prometheus_stats_occurrences( Amount of standard deviation to remove from average statistics to compute checking threshold. 
Threshold is computed as: max(0, average - nb_stddev * stddev) + with_gres_gpu: bool + If True, check only jobs which have allocated.gres_gpu > 0 (GPU jobs) + If False (default), check only jobs which have allocated.gres_gpu == 0 (CPU jobs). + prometheus_stats: Sequence[str] + Prometheus stats to check. Default: "cpu_utilization", "system_memory" """ # Parse time_interval and get data frame @@ -81,24 +91,41 @@ def check_prometheus_stats_occurrences( clip_time = True df = load_job_series(start=start, end=end, clip_time=clip_time) - # Parse minimum_runtime, and select only jobs where - # elapsed time >= minimum runtime and allocated.gres_gpu == 0 + # Parse minimum_runtime if minimum_runtime is None: minimum_runtime = timedelta(seconds=0) - df = df[ - (df["elapsed_time"] >= minimum_runtime.total_seconds()) - & (df["allocated.gres_gpu"] == 0) - ] + # Select only jobs where elapsed time >= minimum runtime and + # jobs are GPU or CPU jobs, depending on `with_gres_gpu` + selection_elapsed_time = df["elapsed_time"] >= minimum_runtime.total_seconds() + selection_gres_gpu = ( + (df["allocated.gres_gpu"] > 0) + if with_gres_gpu + else (df["allocated.gres_gpu"] == 0) + ) + df = df[selection_elapsed_time & selection_gres_gpu] # List clusters cluster_names = cluster_names or sorted(df["cluster_name"].unique()) + # If df is empty, warn for each cluster that we can't check Prometheus stats. + if df.empty: + for cluster_name in cluster_names: + logger.warning( + f"[{cluster_name}] no Prometheus data available: no job found" + ) + # As there's nothing to check, we return immediately. 
+ return + # Split data frame into time frames using `time_unit` df = compute_time_frames(df, frame_size=time_unit) # Duplicates lines per node to count each job for each node where it runs df = df.explode("nodes") + # parse group_by_node + if isinstance(group_by_node, bool): + group_by_node = list(df["cluster_name"].unique()) if group_by_node else () + # If cluster not in group_by_node, # then we must count jobs for the entire cluster, not per node. # To simplify the code, let's just define 1 common node for all cluster jobs @@ -109,14 +136,13 @@ def check_prometheus_stats_occurrences( df.loc[:, "task_"] = 1 # Generate Prometheus context for each Prometheus stat we want to check. - prom_contexts = [ - PrometheusStatInfo(name=prom_col) - for prom_col in ["cpu_utilization", "system_memory"] - ] + prom_contexts = [PrometheusStatInfo(name=prom_col) for prom_col in prometheus_stats] # Add columns to check if job has prometheus stats for prom in prom_contexts: - df.loc[:, prom.col_has] = ~df[prom.name].isnull() + # NB: Use DataFrame.reindex() to add column with NaN values if missing: + # (2024/09/26) https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html + df.loc[:, prom.col_has] = ~(df.reindex(columns=[prom.name])[prom.name].isnull()) # Group per timestamp per cluster per node, and count jobs and prometheus stats. # If "cluster_names" are given, use only jobs in these clusters. @@ -175,3 +201,43 @@ def check_prometheus_stats_occurrences( logger.warning( f"[{cluster_name}] no Prometheus data available: no job found" ) + + +def check_prometheus_stats_for_gpu_jobs( + time_interval: Optional[timedelta] = timedelta(days=7), + time_unit=timedelta(days=1), + minimum_runtime: Optional[timedelta] = timedelta(minutes=5), + cluster_names: Optional[List[str]] = None, + # For GPU jobs, default behaviour is to group each cluster by nodes for checking. 
+ group_by_node: Union[bool, Sequence[str]] = True, + min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None, + nb_stddev=2, +): + """ + Check if we have scraped Prometheus stats for enough GPU jobs per node per cluster per time unit. + Log a warning for each node / cluster where ratio of GPU jobs with Prometheus stats is lower than + a threshold computed using mean and standard deviation statistics from all clusters. + + To get more info about parameters, see documentation for `check_prometheus_stats_occurrences`. + """ + return check_prometheus_stats_occurrences( + time_interval=time_interval, + time_unit=time_unit, + minimum_runtime=minimum_runtime, + cluster_names=cluster_names, + group_by_node=group_by_node, + min_jobs_per_group=min_jobs_per_group, + nb_stddev=nb_stddev, + # We are looking for GPU jobs + with_gres_gpu=True, + # We are looking for GPU-related Prometheus stats + prometheus_stats=( + "gpu_utilization", + "gpu_utilization_fp16", + "gpu_utilization_fp32", + "gpu_utilization_fp64", + "gpu_sm_occupancy", + "gpu_memory", + "gpu_power", + ), + ) diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py new file mode 100644 index 00000000..7c0fe321 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py @@ -0,0 +1,93 @@ +import functools +import re + +import pytest + +from sarc.alerts.usage_alerts.prometheus_stats_occurrences import ( + check_prometheus_stats_for_gpu_jobs, +) +from sarc.client import get_jobs +from tests.functional.jobs.test_func_load_job_series import MOCK_TIME + +from ..jobs.test_func_job_statistics import generate_fake_timeseries + +PARAMS = { + # Check with default params. In last 7 days from now (mock time: 2023-11-22), + # there are only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats. + # So threshold will be 0 everywhere, and no warning will be printed. 
+ "default": dict(), + # Check with no time_interval. + "no_time_interval": dict(time_interval=None), + # Check with no time_interval and low amount of stddev (0.25). + "std_025": dict(time_interval=None, nb_stddev=0.25), + # Check with no time_interval, 0.25 stddev, and 1 extra cluster. + # Expected 1 more warning, no other changes. + "std_025_clusters_extra": dict( + time_interval=None, + nb_stddev=0.25, + cluster_names=[ + "raisin", + "patate", + "fromage", + "mila", + "invisible-cluster", + ], + ), + # Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change. + "std_025_clusters_2": dict( + time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"] + ), + # Check with no time_interval, 0.25 stddev, and no group_by_node. + "std_025_group_none": dict( + time_interval=None, nb_stddev=0.25, group_by_node=() + ), + # Check with no time_interval, 0.25 stddev, and group_by_node for all clusters. + # Same as if group_by_node is not specified, as only `raisin` triggers some warnings. + "std_025_group_full": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + ), + # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs to 2. + "std_025_group_full_min_jobs_2": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + min_jobs_per_group=2, + ), + # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, + # and min jobs set to 3 for only `raisin`. + # No warning, since timestamp when `raisin` triggers warnings has only 2 jobs on this cluster. 
+ "std_025_group_full_min_jobs_raisin": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + min_jobs_per_group={"raisin": 3}, + ), +} + + +@pytest.mark.freeze_time(MOCK_TIME) +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +@pytest.mark.parametrize( + "params", + PARAMS.values(), + ids=PARAMS.keys() +) +def test_check_prometheus_stats_for_gpu_jobs( + params, monkeypatch, caplog, file_regression +): + monkeypatch.setattr( + "sarc.jobs.series.get_job_time_series", generate_fake_timeseries + ) + + for job in get_jobs(): + job.statistics(save=True) + check_prometheus_stats_for_gpu_jobs(**params) + file_regression.check( + re.sub( + r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +", + "", + caplog.text, + ) + ) diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_default_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_default_.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt new file mode 100644 index 00000000..bc2623a9 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node 
/ cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; 
minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt new file mode 100644 index 00000000..e498ec5f --- /dev/null +++ 
b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 diff --git 
a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt new file mode 100644 index 00000000..f8dd1246 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt @@ -0,0 +1,8 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 
0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[invisible-cluster] no Prometheus data available: no job found diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / 
node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient 
Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_raisin_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_raisin_.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt new file mode 100644 index 00000000..dae24a82 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 
0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00