Skip to content

Commit

Permalink
Merge pull request #208 from notoraptor/cw-556-cluster-state-per-day
Browse files Browse the repository at this point in the history
[cw-556] add a script to generate cluster status
soline-b authored Nov 28, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
2 parents a918935 + 0e9eeaa commit faf9e42
Showing 3 changed files with 113 additions and 36 deletions.
55 changes: 24 additions & 31 deletions clockwork_web/core/users_helper.py
Original file line number Diff line number Diff line change
@@ -581,7 +581,12 @@ def render_template_with_user_settings(template_name_or_list, **context):
context["web_settings_json_str"] = json.dumps(context["web_settings"])

# Send the clusters infos to the template
context["clusters"] = get_all_clusters()
# NB: get_all_clusters() seems to return the clusters dict itself
# from config, not a copy. So, any modification on clusters dict
# returned by this function will be propagated into config.
# As we don't want this behaviour here, we will make a copy
# of each cluster dict.
context["clusters"] = {k: v.copy() for k, v in get_all_clusters().items()}
# List clusters available for connected user,
# or set an empty list for anon user.
context["user_clusters"] = (
@@ -591,40 +596,28 @@ def render_template_with_user_settings(template_name_or_list, **context):
)

# Get cluster status (if jobs are old and cluster has error).
"""
for cluster_name in context["clusters"]:
# Cluster error cannot yet be checked, so
# cluster_has_error is always False for now.
cluster_has_error = False
context["clusters"][cluster_name]["status"] = {
"jobs_are_old": _jobs_are_old(cluster_name),
"cluster_has_error": cluster_has_error,
}
"""
context["clusters"][cluster_name]["status"] = _get_cluster_status(cluster_name)

return render_template(template_name_or_list, **context)


def _jobs_are_old(cluster_name):
"""Return True if last slurm update in given cluster is older than 2 days."""
jobs_are_old = False
def _get_cluster_status(cluster_name):
"""
Get cluster status from DB collection `cluster_status`.
mongodb_filter = {"slurm.cluster_name": cluster_name}
Collection should be updated from an independent script
(`scripts/update_clusters_status.py`) regularly.
"""
mc = get_db()
job_with_max_cw_last_slurm_update = list(
mc["jobs"].find(mongodb_filter).sort([("cw.last_slurm_update", -1)]).limit(1)
)

if job_with_max_cw_last_slurm_update:
(job,) = job_with_max_cw_last_slurm_update
if "last_slurm_update" in job["cw"]:
most_recent_job_edition = job["cw"]["last_slurm_update"]
current_timestamp = datetime.now().timestamp()
elapsed_time = timedelta(
seconds=current_timestamp - most_recent_job_edition
)
# Let's say the latest jobs edition must not be older than max_delay.
max_delay = timedelta(days=2)
jobs_are_old = elapsed_time > max_delay

return jobs_are_old
statuses = list(mc["cluster_status"].find({"cluster_name": cluster_name}))
if statuses:
# Status found
(status,) = statuses
return status
else:
# No status found, return default values
return {
"jobs_are_old": False,
"cluster_has_error": False,
}
10 changes: 5 additions & 5 deletions clockwork_web/templates/base.html
Original file line number Diff line number Diff line change
@@ -232,17 +232,17 @@ <h1><a data-bs-toggle="collapse" data-bs-target=".formCollapse" aria-expanded="f
<i class="fa-solid fa-file-lines" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Grafana cluster link') }}"></i>
</a>
<!-- cluster status -->
<!--
{# if D_cluster['status']['jobs_are_old'] #}
{% if D_cluster['status']['jobs_are_old'] %}
<span class="cluster-info cluster-warning">
<i class="fa-solid fa-triangle-exclamation" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Possible stale jobs. Most recent update was more than 30 days ago') }}"></i>
</span>
{# else #}

{% else %}
<span class="cluster-info cluster-good">
<i class="fa-solid fa-ballot-check" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Latest update to jobs is relatively recent (at most 30 days ago)') }}"></i>
</span>
{# endif #}
{% endif %}
<!-- `cluster_has_error` not yet updated, thus still not displayed
{# if D_cluster['status']['cluster_has_error'] #}
<span class="cluster-info cluster-error">
<i class="fa-solid fa-octagon-exclamation" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('A cluster error occurred recently') }}"></i>
84 changes: 84 additions & 0 deletions scripts/update_clusters_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from datetime import datetime, timedelta

from clockwork_web.config import register_config
from clockwork_web.core.clusters_helper import get_all_clusters
from slurm_state.mongo_client import get_mongo_client
from slurm_state.config import get_config


def main():
    """Compute the status of every configured cluster and save it in MongoDB.

    One document per cluster is upserted into the `cluster_status`
    collection, keyed by `cluster_name`. The collection index is created
    only when `list_indexes()` reports no index at all.
    """
    # Declare the config entries required to reach the database.
    register_config("mongo.connection_string", "")
    register_config("mongo.database_name", "clockwork")

    # Resolve the database, then the collection that stores the statuses.
    collection_name = "cluster_status"
    database = get_mongo_client()[get_config("mongo.database_name")]
    status_collection = database[collection_name]

    # Build one status document per cluster.
    # Cluster errors cannot be checked yet, so `cluster_has_error`
    # is always False for now.
    status_documents = [
        {
            "cluster_name": name,
            "jobs_are_old": _jobs_are_old(database, name),
            "cluster_has_error": False,
        }
        for name in get_all_clusters()
    ]

    # Create the collection index when none is reported.
    existing_indexes = list(status_collection.list_indexes())
    if len(existing_indexes) == 0:
        print("Create index for collection:", collection_name)
        index_keys = [
            ("cluster_name", 1),
            ("jobs_are_old", 1),
            ("cluster_has_error", 1),
        ]
        status_collection.create_index(index_keys, name="cluster_status_index")

    # Insert or refresh the status document of each cluster.
    for document in status_documents:
        selector = {"cluster_name": document["cluster_name"]}
        status_collection.update_one(selector, {"$set": document}, upsert=True)

    print("Updated.")


def _jobs_are_old(mc, cluster_name, max_delay=timedelta(days=2)):
    """Return True if the last slurm update in given cluster is older than `max_delay`.

    Args:
        mc: Database object; `mc["jobs"]` must expose the
            `find(...).sort(...).limit(...)` query chain.
        cluster_name: Value matched against the `slurm.cluster_name`
            field of the job documents.
        max_delay: Maximum accepted age of the most recent update, as a
            `timedelta`. Defaults to 2 days.

    Returns:
        True when the most recent `cw.last_slurm_update` found for the
        cluster is older than `max_delay`. False otherwise, including when
        the cluster has no job or when the newest job carries no usable
        `cw.last_slurm_update` value.
    """
    # Only the job with the greatest `cw.last_slurm_update` is needed.
    newest_jobs = list(
        mc["jobs"]
        .find({"slurm.cluster_name": cluster_name})
        .sort([("cw.last_slurm_update", -1)])
        .limit(1)
    )
    if not newest_jobs:
        return False

    # Tolerate documents without a `cw` sub-document or without a
    # timestamp, instead of raising KeyError/TypeError.
    last_update = (newest_jobs[0].get("cw") or {}).get("last_slurm_update")
    if last_update is None:
        return False

    # `last_update` is compared with the current POSIX timestamp (seconds).
    elapsed_time = timedelta(seconds=datetime.now().timestamp() - last_update)
    return elapsed_time > max_delay


# Run the update only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 comments on commit faf9e42

Please sign in to comment.