Skip to content

Commit

Permalink
Merge pull request #7 from vub-hpc/multi-cluster
Browse files Browse the repository at this point in the history
add support for multi-cluster setups in Slurm
  • Loading branch information
wpoely86 authored Jun 20, 2024
2 parents ef29dfb + d548dc2 commit e20fe49
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 34 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Fork of the original [silx-kit/jupyterhub_moss](https://github.com/silx-kit/jupy
maintained by VUB-HPC.

Notable changes in this fork:
* support for multi-cluster setups in Slurm
* support different groups of default environment
* display available resources as job slots per amount of cores

Expand Down
2 changes: 1 addition & 1 deletion jupyterhub_moss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .spawner import MOSlurmSpawner
from .utils import local_path as _local_path

version = "7.0.1.5"
version = "7.5.0"

STATIC_FORM_REGEX = r"/form/(.*)"
STATIC_FORM_PATH = _local_path("form")
Expand Down
1 change: 1 addition & 0 deletions jupyterhub_moss/batch_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
{% endif %}{% if gres %}#SBATCH --gres={{gres}}
{% endif %}{% if nprocs %}#SBATCH --cpus-per-task={{nprocs}}
{% endif %}{% if reservation%}#SBATCH --reservation={{reservation}}
{% endif %}{% if clusters %}#SBATCH {{clusters}}
{% endif %}{% if options %}#SBATCH {{options}}{% endif %}

set -euo pipefail
Expand Down
9 changes: 9 additions & 0 deletions jupyterhub_moss/form/option_form.css
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,12 @@
#environment_add_button:disabled {
color: lightgray;
}

.resource_table > tbody > tr > th {
text-align: center;
width: 20%;
vertical-align: middle;
}
.resource_table > tbody > tr > td {
text-align: center;
}
2 changes: 2 additions & 0 deletions jupyterhub_moss/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class PartitionConfig(BaseModel, frozen=True, extra="forbid"):
description: str = ""
jupyter_environments: Dict[str, JupyterEnvironment]
simple: bool = True
visible_resources: bool = True


class PartitionInfo(PartitionConfig, PartitionResources):
Expand Down Expand Up @@ -153,6 +154,7 @@ class UserOptions(BaseModel):
# Extra fields
gres: str = ""
prologue: str = ""
clusters: str = ""

@classmethod
def parse_formdata(cls, formdata: dict[str, list[str]]) -> UserOptions:
Expand Down
61 changes: 40 additions & 21 deletions jupyterhub_moss/spawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
file_hash,
local_path,
parse_gpu_resource,
parse_partition_id,
parse_timelimit,
)

Expand Down Expand Up @@ -83,7 +84,7 @@ def _validate_partitions(self, proposal: dict) -> dict[str, dict]:

slurm_info_cmd = traitlets.Unicode(
# Get cluster, number of nodes/state, cores/node, cores/state, gpus, total memory and time limit for a single partition
r"sinfo -N -a --noheader -O 'PartitionName,StateCompact,CPUsState,Gres,GresUsed,Memory,Time'",
r"sinfo -N -p {partition} {clusters} --noheader -O 'Cluster,PartitionName,StateCompact,CPUsState,Gres:64,GresUsed:64,Memory,Time'",
help="Command to query cluster information from Slurm. Formatted using req_xyz traits as {xyz}."
"Output will be parsed by ``slurm_info_resources``.",
).tag(config=True)
Expand Down Expand Up @@ -120,6 +121,7 @@ def _slurm_info_resources(

for line in slurm_info_out.splitlines():
(
cluster,
partition,
node_state,
ncores,
Expand All @@ -134,6 +136,10 @@ def _slurm_info_resources(
if node_state not in ["idle", "mix", "alloc"]:
continue

# unique reference from cluster and partition names
if cluster != 'N/A':
partition = f"{cluster}.{partition}"

# core count - allocated/idle/other/total
_, ncores_idle, _, ncores_total = ncores.split("/")
ncores_idle = int(ncores_idle)
Expand Down Expand Up @@ -228,33 +234,38 @@ async def _get_partitions_info(self) -> dict[str, PartitionInfo]:
2. Parses output with slurm_info_resources
3. Combines info with partitions traitlet
"""
# Execute given slurm info command
subvars = self.get_req_subvars()
cmd = " ".join(
(
format_template(self.exec_prefix, **subvars),
format_template(self.slurm_info_cmd, **subvars),
partitions = PartitionsTrait.model_validate(self.partitions)
partitions_info = {}

# Execute given slurm info command on each partition
for partition_id, config_partition_info in partitions.items():
subvars = self.get_req_subvars()
subvars["partition"], subvars["clusters"] = parse_partition_id(partition_id)
if subvars["clusters"]:
subvars["clusters"] = f"-M {subvars['clusters']}"

sinfo_cmd = " ".join(
(
format_template(self.exec_prefix, **subvars),
format_template(self.slurm_info_cmd, **subvars),
)
)
)
self.log.debug("Slurm info command: %s", cmd)
out = await self.run_command(cmd)
self.log.debug("Slurm info command for partition ID '%s': %s", partition_id, sinfo_cmd)
partition_sinfo_out = await self.run_command(sinfo_cmd)
# self.log.debug("Slurm info command output: %s", partition_sinfo_out)

# Parse command output
resources_info = self.slurm_info_resources(out)
self.log.debug("Slurm partition resources: %s", resources_info)
# Parse command output
resources_info = self.slurm_info_resources(partition_sinfo_out)
self.log.debug("Slurm partition resources: %s", resources_info)

partitions = PartitionsTrait.model_validate(self.partitions)

# use data from Slurm as base and overwrite with manual configuration settings
partitions_info = {
partition: PartitionInfo.model_validate(
# use data from Slurm as base and overwrite with manual configuration settings
partitions_info[partition_id] = PartitionInfo.model_validate(
{
**resources_info[partition],
**resources_info[partition_id],
**config_partition_info.dict(exclude_none=True),
}
)
for partition, config_partition_info in partitions.items()
}

return partitions_info

@staticmethod
Expand Down Expand Up @@ -326,6 +337,14 @@ def __update_options(
The provided `options` argument is modified in-place.
"""
# Handle multi-cluster partitions
partition_name, cluster_name = parse_partition_id(options.partition)
options.partition = partition_name
if cluster_name:
options.clusters = f"--clusters={cluster_name}"
self.req_cluster = cluster_name
self.state_exechost_exp = rf"\1.{cluster_name}.os"

# Specific handling of exclusive flag
# When memory=0 or all CPU are requested, set the exclusive flag
if options.nprocs == partition_info.max_nprocs or options.memory == "0":
Expand Down
26 changes: 14 additions & 12 deletions jupyterhub_moss/templates/option_form.html
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
{% macro resource_tab_footer(partitions, simple_only) %}
<h4 class="subheading">Available resources at this time</h4>
<table class="table">
<table class="table resource_table">
<tr class="active">
<th style="padding-right: 10rem;">Partition</th>
<th style="text-align: center; width: 25%;">1-core slots</th>
<th style="text-align: center; width: 25%;">2-cores slots</th>
<th style="text-align: center; width: 25%;">4-cores slots</th>
<th style="text-align: center; width: 25%;">GPU slots</th>
<th>Cluster Partition</th>
<th>1-core slots</th>
<th>2-cores slots</th>
<th>4-cores slots</th>
<th>GPU slots</th>
</tr>
{% for name, partition in partitions.items() %}
{% if partition.visible_resources %}
{% if partition.simple or not simple_only %}
<tr>
<th>{{ name }}</th>
<th style="text-align: left">{{ name }}</th>
{% for slot in partition['job_slots'] %}
<th style="text-align: center">{{ slot }}</th>
<td>{{ slot }}</td>
{% endfor %}
</tr>
{% endif %}
{% endif %}
{% endfor %}
</table>
{% endmacro %}
Expand All @@ -43,7 +45,7 @@ <h4 class="subheading">Available resources at this time</h4>
</ul>
<div class="tab-content">
<div id="home" class="tab-pane fade in active">
<h4 class="subheading">Partition</h4>
<h4 class="subheading">Cluster Partition</h4>
<div class="radio-toolbar">
{% for name, partition in partitions.items() %}
{% if partition.simple %}
Expand All @@ -57,8 +59,8 @@ <h4 class="subheading">Partition</h4>
{% endif %}
/>
<label for="{{ name }}">
<p>{{ partition.description }} <span class="label-extra-info">({{ partition.architecture }})</span></p>
<p class="label-extra-info">Partition: {{ name }}</p>
<p>{{ partition.description }}</p>
<p class="label-extra-info">{{ name }} ({{ partition.architecture }})</p>
</label>
{% endif %}
{% endfor %}
Expand Down Expand Up @@ -148,7 +150,7 @@ <h4 class="subheading">Options</h4>
<div id="menu1" class="tab-pane fade indent-right" align="right">
<div class="form-container">
<label for="partition" accesskey="p">
Partition <span class="label-extra-info">(--partition)</span>:
Cluster Partition:
</label>
<select
name="partition"
Expand Down
14 changes: 14 additions & 0 deletions jupyterhub_moss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,17 @@ def create_prologue(
prologue += f'\nexport PATH="{environment_path}:$PATH"'

return prologue


def parse_partition_id(partition_id: str) -> tuple[str, str]:
    """Split a partition ID from the configuration into its two components.

    A partition ID is either a bare partition name (``"partition"``) or a
    cluster-qualified name (``"cluster.partition"``). Only the first dot is
    treated as the separator, so any further dots stay in the partition name.

    Args:
        partition_id: Partition identifier, optionally prefixed by
            ``"<cluster>."``.

    Returns:
        A ``(partition_name, cluster_name)`` tuple. ``cluster_name`` is an
        empty string when ``partition_id`` contains no dot.
    """
    head, separator, tail = partition_id.partition(".")
    if not separator:
        # No dot at all: the whole ID is the partition name.
        return head, ""

    # "<cluster>.<partition>": text before the first dot is the cluster.
    return tail, head

0 comments on commit e20fe49

Please sign in to comment.