Skip to content

Commit

Permalink
[SARC-330] Implémenter les alertes : Proportion de jobs GPU avec stat…
Browse files Browse the repository at this point in the history
…s prometheus spécifique aux GPUs sur un noeud donné plus bas qu’un threshold X
  • Loading branch information
notoraptor committed Sep 27, 2024
1 parent bab50a5 commit d47872f
Show file tree
Hide file tree
Showing 12 changed files with 218 additions and 14 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ disable = [
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation",
"duplicate-code",
"too-many-positional-arguments",
]
extension-pkg-whitelist = "pydantic"

Expand Down
94 changes: 80 additions & 14 deletions sarc/alerts/usage_alerts/prometheus_stats_occurrences.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ def __init__(self, name):
self.threshold = None


# pylint: disable=too-many-branches
def check_prometheus_stats_occurrences(
time_interval: Optional[timedelta] = timedelta(days=7),
time_unit=timedelta(days=1),
minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
cluster_names: Optional[List[str]] = None,
group_by_node: Optional[Sequence[str]] = ("mila",),
group_by_node: Union[bool, Sequence[str]] = ("mila",),
min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
nb_stddev=2,
with_gres_gpu=False,
prometheus_stats=("cpu_utilization", "system_memory"),
):
"""
Check if we have scrapped Prometheus stats for enough jobs per node per cluster per time unit.
Expand Down Expand Up @@ -56,8 +59,10 @@ def check_prometheus_stats_occurrences(
If a cluster in this list does not appear in jobs, a warning will be logged.
If empty (or not specified), use all clusters available among jobs retrieved with time_interval.
group_by_node: Sequence
Optional sequence of clusters to group by node.
group_by_node: Sequence | bool
Either a sequence of clusters to group by node,
or False to indicate no cluster to group by node (equivalent to empty sequence),
or True to indicate that all clusters must be grouped by node.
For clusters in this list, we will check each node separately (ie. a "group" is a cluster node).
By default, we check the entire cluster (i.e. the "group" is the cluster itself).
min_jobs_per_group: int | dict
Expand All @@ -71,6 +76,11 @@ def check_prometheus_stats_occurrences(
Amount of standard deviation to remove from average statistics to compute checking threshold.
Threshold is computed as:
max(0, average - nb_stddev * stddev)
with_gres_gpu: bool
If True, check only jobs which have allocated.gres_gpu > 0 (GPU jobs)
If False (default), check only jobs which have allocated.gres_gpu == 0 (CPU jobs).
prometheus_stats: Sequence[str]
Prometheus stats to check. Default: "cpu_utilization", "system_memory"
"""

# Parse time_interval and get data frame
Expand All @@ -81,24 +91,41 @@ def check_prometheus_stats_occurrences(
clip_time = True
df = load_job_series(start=start, end=end, clip_time=clip_time)

# Parse minimum_runtime, and select only jobs where
# elapsed time >= minimum runtime and allocated.gres_gpu == 0
# Parse minimum_runtime
if minimum_runtime is None:
minimum_runtime = timedelta(seconds=0)
df = df[
(df["elapsed_time"] >= minimum_runtime.total_seconds())
& (df["allocated.gres_gpu"] == 0)
]
# Select only jobs where elapsed time >= minimum runtime and
# jobs are GPU or CPU jobs, depending on `with_gres_gpu`
selection_elapsed_time = df["elapsed_time"] >= minimum_runtime.total_seconds()
selection_gres_gpu = (
(df["allocated.gres_gpu"] > 0)
if with_gres_gpu
else (df["allocated.gres_gpu"] == 0)
)
df = df[selection_elapsed_time & selection_gres_gpu]

# List clusters
cluster_names = cluster_names or sorted(df["cluster_name"].unique())

# If df is empty, warn for each cluster that we can't check Prometheus stats.
if df.empty:
for cluster_name in cluster_names:
logger.warning(
f"[{cluster_name}] no Prometheus data available: no job found"
)
# As there's nothing to check, we return immediately.
return

# Split data frame into time frames using `time_unit`
df = compute_time_frames(df, frame_size=time_unit)

# Duplicates lines per node to count each job for each node where it runs
df = df.explode("nodes")

# parse group_by_node
if isinstance(group_by_node, bool):
group_by_node = list(df["cluster_name"].unique()) if group_by_node else ()

# If cluster not in group_by_node,
# then we must count jobs for the entire cluster, not per node.
# To simplify the code, let's just define 1 common node for all cluster jobs
Expand All @@ -109,14 +136,13 @@ def check_prometheus_stats_occurrences(
df.loc[:, "task_"] = 1

# Generate Prometheus context for each Prometheus stat we want to check.
prom_contexts = [
PrometheusStatInfo(name=prom_col)
for prom_col in ["cpu_utilization", "system_memory"]
]
prom_contexts = [PrometheusStatInfo(name=prom_col) for prom_col in prometheus_stats]

# Add columns to check if job has prometheus stats
for prom in prom_contexts:
df.loc[:, prom.col_has] = ~df[prom.name].isnull()
# NB: Use DataFrame.reindex() to add column with NaN values if missing:
# (2024/09/26) https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html
df.loc[:, prom.col_has] = ~(df.reindex(columns=[prom.name])[prom.name].isnull())

# Group per timestamp per cluster per node, and count jobs and prometheus stats.
# If "cluster_names" are given, use only jobs in these clusters.
Expand Down Expand Up @@ -175,3 +201,43 @@ def check_prometheus_stats_occurrences(
logger.warning(
f"[{cluster_name}] no Prometheus data available: no job found"
)


def check_prometheus_stats_for_gpu_jobs(
    time_interval: Optional[timedelta] = timedelta(days=7),
    time_unit=timedelta(days=1),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
    cluster_names: Optional[List[str]] = None,
    # GPU jobs are checked per node by default, hence True here.
    group_by_node: Union[bool, Sequence[str]] = True,
    min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
    nb_stddev=2,
):
    """
    Check if we have scrapped Prometheus stats for enough GPU jobs per node per cluster per time unit.

    Log a warning for each node / cluster where the ratio of GPU jobs with
    Prometheus stats is lower than a threshold computed using mean and
    standard deviation statistics from all clusters.

    To get more info about parameters, see documentation for
    `check_prometheus_stats_occurrences`.
    """
    # GPU-related Prometheus stats whose presence we want to verify.
    gpu_prometheus_stats = (
        "gpu_utilization",
        "gpu_utilization_fp16",
        "gpu_utilization_fp32",
        "gpu_utilization_fp64",
        "gpu_sm_occupancy",
        "gpu_memory",
        "gpu_power",
    )
    # Delegate to the generic checker, restricted to GPU jobs
    # (with_gres_gpu=True) and the GPU-specific stats above.
    return check_prometheus_stats_occurrences(
        time_interval=time_interval,
        time_unit=time_unit,
        minimum_runtime=minimum_runtime,
        cluster_names=cluster_names,
        group_by_node=group_by_node,
        min_jobs_per_group=min_jobs_per_group,
        nb_stddev=nb_stddev,
        with_gres_gpu=True,
        prometheus_stats=gpu_prometheus_stats,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import functools
import re

import pytest

from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
check_prometheus_stats_for_gpu_jobs,
)
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries

# Keyword-argument sets passed to `check_prometheus_stats_for_gpu_jobs`
# in the parametrized regression test below; keys double as test ids.
PARAMS = {
    # Check with default params. In last 7 days from now (mock time: 2023-11-22),
    # there are only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats.
    # So threshold will be 0 everywhere, and no warning will be printed.
    "default": dict(),
    # Check with no time_interval.
    "no_time_interval": dict(time_interval=None),
    # Check with no time_interval and low amount of stddev (0.25).
    "std_025": dict(time_interval=None, nb_stddev=0.25),
    # Check with no time_interval, 0.25 stddev, and 1 extra cluster.
    # Expected 1 more warning, no other changes.
    "std_025_clusters_extra": dict(
        time_interval=None,
        nb_stddev=0.25,
        cluster_names=[
            "raisin",
            "patate",
            "fromage",
            "mila",
            "invisible-cluster",
        ],
    ),
    # Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change.
    "std_025_clusters_2": dict(
        time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"]
    ),
    # Check with no time_interval, 0.25 stddev, and no group_by_node.
    "std_025_group_none": dict(time_interval=None, nb_stddev=0.25, group_by_node=()),
    # Check with no time_interval, 0.25 stddev, and group_by_node for all clusters.
    # Same as if group_by_node is not specified, as only `raisin` triggers some warnings.
    "std_025_group_full": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
    ),
    # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs to 2.
    "std_025_group_full_min_jobs_2": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
        min_jobs_per_group=2,
    ),
    # Check with no time_interval, 0.25 stddev, group_by_node for all clusters,
    # and min jobs set to 3 for only `raisin`.
    # No warning, since timestamp when `raisin` triggers warnings has only 2 jobs on this cluster.
    "std_025_group_full_min_jobs_raisin": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
        min_jobs_per_group={"raisin": 3},
    ),
}


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", PARAMS.values(), ids=PARAMS.keys())
def test_check_prometheus_stats_for_gpu_jobs(
    params, monkeypatch, caplog, file_regression
):
    """Regression test for warnings emitted by `check_prometheus_stats_for_gpu_jobs`."""
    # Replace real Prometheus time-series queries with deterministic fakes.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )
    # Compute and save statistics for every job so the checker has data to read.
    for job in get_jobs():
        job.statistics(save=True)

    check_prometheus_stats_for_gpu_jobs(**params)

    # Drop the volatile logging prefix (level, module path, file, line number)
    # before comparing captured warnings against the stored regression file.
    prefix_pattern = re.compile(
        r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +"
    )
    cleaned_logs = prefix_pattern.sub("", caplog.text)
    file_regression.check(cleaned_logs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[invisible-cluster] no Prometheus data available: no job found
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
Loading

0 comments on commit d47872f

Please sign in to comment.