Skip to content

Commit

Permalink
Update as per feedback, this commit gives slurm user permission to run
Browse files Browse the repository at this point in the history
scripts as root
  • Loading branch information
harshthakkar01 committed May 8, 2024
1 parent 9a2db84 commit 5e19f9d
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 20 deletions.
21 changes: 21 additions & 0 deletions ansible/roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@
- '{{slurm_paths.state}}'
- '{{slurm_paths.run}}'

# Create the directory that receives the wrapper scripts installed by the
# "Copy bash scripts" task.
# NOTE(review): this directory is owned by the slurm user, while the scripts
# placed in it are root-owned and runnable through sudo. Confirm that the
# slurm user being able to add/replace entries here is acceptable.
- name: Mkdir for slurm bash scripts
  file:
    path: '{{ slurm_paths.scripts }}/bash_scripts'
    state: directory
    owner: '{{ slurm_user.user }}'
    group: '{{ slurm_user.group }}'
    mode: '0700'

- name: Include Install Tasks
include_tasks: install.yml

Expand Down Expand Up @@ -83,6 +93,17 @@
mode: '0644'
directory_mode: '0755'

# Install the root-only wrapper scripts; each one starts a single function of
# the setup_network_storage Python module.
- name: Copy bash scripts
  copy:
    src: 'scripts/bash_scripts/{{ item }}'
    dest: '{{ slurm_paths.scripts }}/bash_scripts/{{ item }}'
    owner: root
    group: root
    mode: '0700'
  loop:
    - run_setup_network_storage.sh
    - run_setup_nfs_exports.sh

- name: Copy Jobs
copy:
src: jobs/
Expand Down
3 changes: 3 additions & 0 deletions scripts/bash_scripts/run_setup_network_storage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Run the "setup_network_storage" action of the setup_network_storage module.
# `python3 -m` looks the module up relative to the current directory, so abort
# instead of running from the wrong place when the cd fails.
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_network_storage"
3 changes: 3 additions & 0 deletions scripts/bash_scripts/run_setup_nfs_exports.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Run the "setup_nfs_exports" action of the setup_network_storage module.
# `python3 -m` looks the module up relative to the current directory, so abort
# instead of running from the wrong place when the cd fails.
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_nfs_exports"
32 changes: 24 additions & 8 deletions scripts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@
login_nodeset,
)
import slurmsync
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)


SETUP_SCRIPT = Path(__file__)
Expand Down Expand Up @@ -287,11 +283,31 @@ def setup_nss_slurm():
run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf")


def call_setup_network_storage(log):
    """Run the network-storage setup wrapper script through sudo.

    Executes /slurm/scripts/bash_scripts/run_setup_network_storage.sh via
    ``sudo`` and reports the outcome on ``log``. Failures are logged, not
    raised, so the caller continues either way.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"]
    try:
        # check=True is required: without it subprocess.run() returns normally
        # on a non-zero exit status and the failure would be logged as success.
        subprocess.run(cmd, check=True)
    except Exception as e:
        log.error(e)
    else:
        log.info("network storage mounted successfully")


def call_setup_nfs_exports(log=None):
    """Run the NFS-exports setup wrapper script through sudo.

    Executes /slurm/scripts/bash_scripts/run_setup_nfs_exports.sh via ``sudo``
    and reports the outcome on ``log``. Failures are logged, not raised, so
    the caller continues either way.

    Args:
        log: logger-like object providing ``info`` and ``error``. Optional
            because this module also calls the function without arguments;
            a logger named after this module is used in that case.
    """
    if log is None:
        import logging

        log = logging.getLogger(__name__)

    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"]
    try:
        # check=True is required: without it subprocess.run() returns normally
        # on a non-zero exit status and the failure would be logged as success.
        subprocess.run(cmd, check=True)
    except Exception as e:
        log.error(e)
    else:
        log.info("nfs exported successfully")


def setup_sudoers():
content = """
# Allow SlurmUser to manage the slurm daemons
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/bash_scripts/run_setup_network_storage.sh
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/bash_scripts/run_setup_nfs_exports.sh
"""
sudoers_file = Path("/etc/sudoers.d/slurm")
sudoers_file.write_text(content)
Expand Down Expand Up @@ -399,7 +415,7 @@ def setup_controller(args):

if cfg.controller_secondary_disk:
setup_secondary_disks()
setup_network_storage(log)
call_setup_network_storage(log)

run_custom_scripts()

Expand Down Expand Up @@ -431,7 +447,7 @@ def setup_controller(args):
run("systemctl enable nfs-server", timeout=30)
run("systemctl start nfs-server", timeout=30)

setup_nfs_exports()
call_setup_nfs_exports()
run("systemctl enable --now slurmcmd.timer", timeout=30)

log.info("Check status of cluster services")
Expand Down Expand Up @@ -464,7 +480,7 @@ def setup_login(args):
update_system_config("slurmd", sysconf)
install_custom_scripts()

setup_network_storage(log)
call_setup_network_storage(log)
setup_sudoers()
run("systemctl restart munge")
run("systemctl enable slurmd", timeout=30)
Expand Down Expand Up @@ -498,7 +514,7 @@ def setup_compute(args):
install_custom_scripts()

setup_nss_slurm()
setup_network_storage(log)
call_setup_network_storage(log)

has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode
if has_gpu:
Expand Down
35 changes: 30 additions & 5 deletions scripts/setup_network_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,31 @@
import time

import shutil
import logging
from pathlib import Path
from concurrent.futures import as_completed
from addict import Dict as NSDict

from util import (
cfg,
lkp,
dirs,
separate,
run,
host_lookup,
chown_slurm,
config_root_logger,
load_config_file,
backoff_delay,
)


# Base name of this file; used both as the logger name and as the stem of the
# log file.
filename = Path(__file__).name
# Log into cfg.slurm_log_dir when a config is available, otherwise into the
# current directory; the file's own suffix is swapped for ".log".
LOGFILE = (
    (Path(cfg.slurm_log_dir) if cfg else Path("."))
    .joinpath(filename)
    .with_suffix(".log")
)

log = logging.getLogger(filename)


def setup_nfs_exports():
"""nfs export all needed directories"""
# The controller only needs to set up exports for cluster-internal mounts
Expand Down Expand Up @@ -78,7 +88,7 @@ def setup_nfs_exports():
run("exportfs -a", timeout=30)


def munge_mount_handler(log):
def munge_mount_handler():
cfg = load_config_file(Path(__file__).with_name("config.yaml"))
if not cfg.munge_mount:
log.error("Missing munge_mount in cfg")
Expand Down Expand Up @@ -154,7 +164,7 @@ def munge_mount_handler(log):
shutil.rmtree(local_mount)


def mount_fstab(mounts, log):
def mount_fstab(mounts):
"""Wait on each mount, then make sure all fstab is mounted"""
from more_executors import Executors, ExceptionRetryPolicy

Expand Down Expand Up @@ -250,10 +260,11 @@ def resolve_network_storage(nodeset=None):
return list(mounts.values())


def setup_network_storage(log):
def setup_network_storage():
"""prepare network fs mounts and add them to fstab"""
log.info("Set up network storage")
# filter mounts into two dicts, cluster-internal and external mounts

all_mounts = resolve_network_storage()
ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts)
if lkp.instance_role == "controller":
Expand Down Expand Up @@ -308,5 +319,19 @@ def setup_network_storage(log):
for entry in fstab_entries:
f.write(entry)
f.write("\n")
mount_fstab(mounts_by_local(mounts), log)
munge_mount_handler(log)
mount_fstab(mounts_by_local(mounts))
munge_mount_handler()


if __name__ == "__main__":
    # Command-line entry point:
    #   python3 -m setup_network_storage <setup_network_storage|setup_nfs_exports>
    chown_slurm(LOGFILE, mode=0o600)
    config_root_logger(filename, logfile=LOGFILE)

    # Map the single command-line argument onto the function it selects.
    actions = {
        "setup_network_storage": setup_network_storage,
        "setup_nfs_exports": setup_nfs_exports,
    }
    if len(sys.argv) != 2 or sys.argv[1] not in actions:
        log.error(
            "expected exactly one argument, one of: %s", ", ".join(actions)
        )
        # Exit non-zero so the calling process can tell that nothing was run;
        # an unrecognised name must not silently fall through to an action.
        sys.exit(2)
    actions[sys.argv[1]]()
29 changes: 22 additions & 7 deletions scripts/slurmsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from itertools import chain
from pathlib import Path
import yaml
import subprocess

import util
from util import (
Expand Down Expand Up @@ -59,10 +60,6 @@
install_cgroup_conf,
install_topology_conf,
)
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)

filename = Path(__file__).name
LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")
Expand Down Expand Up @@ -97,6 +94,24 @@ def start_instance_op(inst, project=None):
)


def call_setup_network_storage(log):
    """Run the network-storage setup wrapper script through sudo.

    Executes /slurm/scripts/bash_scripts/run_setup_network_storage.sh via
    ``sudo`` and reports the outcome on ``log``. Failures are logged, not
    raised, so the caller continues either way.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"]
    try:
        # check=True is required: without it subprocess.run() returns normally
        # on a non-zero exit status and the failure would be logged as success.
        subprocess.run(cmd, check=True)
    except Exception as e:
        log.error(e)
    else:
        log.info("network storage mounted successfully")


def call_setup_nfs_exports(log):
    """Run the NFS-exports setup wrapper script through sudo.

    Executes /slurm/scripts/bash_scripts/run_setup_nfs_exports.sh via ``sudo``
    and reports the outcome on ``log``. Failures are logged, not raised, so
    the caller continues either way.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"]
    try:
        # check=True is required: without it subprocess.run() returns normally
        # on a non-zero exit status and the failure would be logged as success.
        subprocess.run(cmd, check=True)
    except Exception as e:
        log.error(e)
    else:
        log.info("nfs exported successfully")


def start_instances(node_list):
log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list)))

Expand Down Expand Up @@ -486,8 +501,8 @@ def reconfigure_slurm():
install_gres_conf(lkp)
install_cgroup_conf(lkp)
install_topology_conf(lkp)
setup_network_storage(log)
setup_nfs_exports()
call_setup_network_storage(log)
call_setup_nfs_exports(log)
log.info("Restarting slurmctld to make changes take effect.")
try:
run("sudo systemctl restart slurmctld.service", check=False)
Expand All @@ -497,7 +512,7 @@ def reconfigure_slurm():
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lkp.instance_role_safe in ["compute", "login"]:
setup_network_storage(log)
call_setup_network_storage(log)
log.info("Restarting slurmd to make changes take effect.")
run("systemctl restart slurmd")
util.run(f"wall '{update_msg}'", timeout=30)
Expand Down

0 comments on commit 5e19f9d

Please sign in to comment.