Skip to content

Commit

Permalink
Update as per feedback, this commit gives slurm user permission to run
Browse files Browse the repository at this point in the history
scripts as root
  • Loading branch information
harshthakkar01 committed May 8, 2024
1 parent 9a2db84 commit 24b5d84
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 42 deletions.
21 changes: 21 additions & 0 deletions ansible/roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@
- '{{slurm_paths.state}}'
- '{{slurm_paths.run}}'

# The scripts installed into this directory are root-owned and are meant to be
# run as root through sudo. The directory must therefore not be writable by
# the unprivileged slurm user: an owner of the directory can rename or delete
# the root-owned files in it and substitute its own, which sudo would then run
# as root. Mode 0755 keeps the path traversable so the command can be resolved.
# NOTE(review): the parent '{{slurm_paths.scripts}}' directory needs the same
# scrutiny - confirm it is not writable by the slurm user either.
- name: Mkdir for slurm bash scripts
  file:
    path: '{{item}}'
    state: directory
    owner: root
    group: root
    mode: '0755'
  loop:
    - '{{slurm_paths.scripts}}/bash_scripts'

- name: Include Install Tasks
include_tasks: install.yml

Expand Down Expand Up @@ -83,6 +93,17 @@
mode: '0644'
directory_mode: '0755'

# Install the wrapper scripts; they are root-owned and only root may read or
# execute them (mode 0700).
- name: Copy bash scripts
  copy:
    src: 'scripts/bash_scripts/{{item}}'
    dest: '{{ slurm_paths.scripts }}/bash_scripts/{{item}}'
    owner: 'root'
    group: 'root'
    mode: '0700'
  loop:
    - run_setup_network_storage.sh
    - run_setup_nfs_exports.sh

- name: Copy Jobs
copy:
src: jobs/
Expand Down
3 changes: 3 additions & 0 deletions scripts/bash_scripts/run_setup_network_storage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Wrapper that calls setup_network_storage.setup_network_storage() from the
# scripts directory. The exit status is that of the python3 process.

# Abort if the directory is unavailable: python3 resolves the import relative
# to the working directory, so continuing after a failed cd would import
# whatever module happens to be in the caller's current directory.
cd /slurm/scripts/ || exit 1
python3 -c 'import setup_network_storage; setup_network_storage.setup_network_storage()'
3 changes: 3 additions & 0 deletions scripts/bash_scripts/run_setup_nfs_exports.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Wrapper that calls setup_network_storage.setup_nfs_exports() from the
# scripts directory. The exit status is that of the python3 process.

# Abort if the directory is unavailable: python3 resolves the import relative
# to the working directory, so continuing after a failed cd would import
# whatever module happens to be in the caller's current directory.
cd /slurm/scripts/ || exit 1
python3 -c 'import setup_network_storage; setup_network_storage.setup_nfs_exports()'
28 changes: 20 additions & 8 deletions scripts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@
login_nodeset,
)
import slurmsync
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)


SETUP_SCRIPT = Path(__file__)
Expand Down Expand Up @@ -287,11 +283,27 @@ def setup_nss_slurm():
run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf")


def call_setup_network_storage():
    """Run the network-storage setup wrapper script through sudo.

    Returns:
        int: exit status of the sudo invocation (0 on success).
    """
    script = "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"
    result = subprocess.run(["sudo", script])
    if result.returncode != 0:
        # The helper runs in a separate process, so its exceptions no longer
        # propagate here; record the failure so it is not silently dropped
        # when a caller ignores the return value.
        log.error(f"{script} failed with exit code {result.returncode}")
    return result.returncode


def call_setup_nfs_exports():
    """Run the NFS-exports setup wrapper script through sudo.

    Returns:
        int: exit status of the sudo invocation (0 on success).
    """
    script = "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"
    result = subprocess.run(["sudo", script])
    if result.returncode != 0:
        # The helper runs in a separate process, so its exceptions no longer
        # propagate here; record the failure so it is not silently dropped
        # when a caller ignores the return value.
        log.error(f"{script} failed with exit code {result.returncode}")
    return result.returncode


def setup_sudoers():
content = """
# Allow SlurmUser to manage the slurm daemons
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/bash_scripts/run_setup_network_storage.sh
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/bash_scripts/run_setup_nfs_exports.sh
"""
sudoers_file = Path("/etc/sudoers.d/slurm")
sudoers_file.write_text(content)
Expand Down Expand Up @@ -399,7 +411,7 @@ def setup_controller(args):

if cfg.controller_secondary_disk:
setup_secondary_disks()
setup_network_storage(log)
call_setup_network_storage()

run_custom_scripts()

Expand Down Expand Up @@ -431,7 +443,7 @@ def setup_controller(args):
run("systemctl enable nfs-server", timeout=30)
run("systemctl start nfs-server", timeout=30)

setup_nfs_exports()
call_setup_nfs_exports()
run("systemctl enable --now slurmcmd.timer", timeout=30)

log.info("Check status of cluster services")
Expand Down Expand Up @@ -464,7 +476,7 @@ def setup_login(args):
update_system_config("slurmd", sysconf)
install_custom_scripts()

setup_network_storage(log)
call_setup_network_storage()
setup_sudoers()
run("systemctl restart munge")
run("systemctl enable slurmd", timeout=30)
Expand Down Expand Up @@ -498,7 +510,7 @@ def setup_compute(args):
install_custom_scripts()

setup_nss_slurm()
setup_network_storage(log)
call_setup_network_storage()

has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode
if has_gpu:
Expand Down
56 changes: 29 additions & 27 deletions scripts/setup_network_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ def setup_nfs_exports():
run("exportfs -a", timeout=30)


def munge_mount_handler(log):
def munge_mount_handler():
cfg = load_config_file(Path(__file__).with_name("config.yaml"))
if not cfg.munge_mount:
log.error("Missing munge_mount in cfg")
elif lkp.instance_role == "controller":
# if not cfg.munge_mount:
# # log.error("Missing munge_mount in cfg")
if lkp.instance_role == "controller":
return

mount = cfg.munge_mount
Expand All @@ -102,7 +102,7 @@ def munge_mount_handler(log):

munge_key = Path(dirs.munge / "munge.key")

log.info(f"Mounting munge share to: {local_mount}")
# log.info(f"Mounting munge share to: {local_mount}")
local_mount.mkdir()
if fs_type.lower() == "gcsfuse".lower():
if remote_mount is None:
Expand Down Expand Up @@ -130,43 +130,43 @@ def munge_mount_handler(log):
run(cmd, timeout=timeout)
break
except Exception as e:
log.error(
f"munge mount failed: '{cmd}' {e}, try {retry}, waiting {wait:0.2f}s"
)
# log.error(
# f"munge mount failed: '{cmd}' {e}, try {retry}, waiting {wait:0.2f}s"
# )
time.sleep(wait)
err = e
continue
else:
raise err

log.info(f"Copy munge.key from: {local_mount}")
# log.info(f"Copy munge.key from: {local_mount}")
shutil.copy2(Path(local_mount / "munge.key"), munge_key)

log.info("Restrict permissions of munge.key")
# log.info("Restrict permissions of munge.key")
shutil.chown(munge_key, user="munge", group="munge")
os.chmod(munge_key, stat.S_IRUSR)

log.info(f"Unmount {local_mount}")
# log.info(f"Unmount {local_mount}")
if fs_type.lower() == "gcsfuse".lower():
run(f"fusermount -u {local_mount}", timeout=120)
else:
run(f"umount {local_mount}", timeout=120)
shutil.rmtree(local_mount)


def mount_fstab(mounts, log):
def mount_fstab(mounts):
"""Wait on each mount, then make sure all fstab is mounted"""
from more_executors import Executors, ExceptionRetryPolicy

def mount_path(path):
log.info(f"Waiting for '{path}' to be mounted...")
# log.info(f"Waiting for '{path}' to be mounted...")
try:
run(f"mount {path}", timeout=120)
except Exception as e:
exc_type, _, _ = sys.exc_info()
log.error(f"mount of path '{path}' failed: {exc_type}: {e}")
# log.error(f"mount of path '{path}' failed: {exc_type}: {e}")
raise e
log.info(f"Mount point '{path}' was mounted.")
# log.info(f"Mount point '{path}' was mounted.")

MAX_MOUNT_TIMEOUT = 60 * 5
future_list = []
Expand Down Expand Up @@ -250,10 +250,11 @@ def resolve_network_storage(nodeset=None):
return list(mounts.values())


def setup_network_storage(log):
def setup_network_storage():
"""prepare network fs mounts and add them to fstab"""
log.info("Set up network storage")
# log.info("Set up network storage")
# filter mounts into two dicts, cluster-internal and external mounts
print("INFO: set up network storage called")
all_mounts = resolve_network_storage()
ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts)
if lkp.instance_role == "controller":
Expand All @@ -269,14 +270,14 @@ def setup_network_storage(log):
server_ip = mount.server_ip or ""
local_mount.mkdirp()

log.info(
"Setting up mount ({}) {}{} to {}".format(
fs_type,
server_ip + ":" if fs_type != "gcsfuse" else "",
remote_mount,
local_mount,
)
)
# log.info(
# "Setting up mount ({}) {}{} to {}".format(
# fs_type,
# server_ip + ":" if fs_type != "gcsfuse" else "",
# remote_mount,
# local_mount,
# )
# )

mount_options = mount.mount_options.split(",") if mount.mount_options else []
if not mount_options or "_netdev" not in mount_options:
Expand Down Expand Up @@ -308,5 +309,6 @@ def setup_network_storage(log):
for entry in fstab_entries:
f.write(entry)
f.write("\n")
mount_fstab(mounts_by_local(mounts), log)
munge_mount_handler(log)
mount_fstab(mounts_by_local(mounts))
munge_mount_handler()
print("INFO: set up network storage finished")
25 changes: 18 additions & 7 deletions scripts/slurmsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from itertools import chain
from pathlib import Path
import yaml
import subprocess

import util
from util import (
Expand Down Expand Up @@ -59,10 +60,6 @@
install_cgroup_conf,
install_topology_conf,
)
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)

filename = Path(__file__).name
LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")
Expand Down Expand Up @@ -97,6 +94,20 @@ def start_instance_op(inst, project=None):
)


def call_setup_network_storage():
    """Run the network-storage setup wrapper script through sudo.

    Returns:
        int: exit status of the sudo invocation (0 on success).
    """
    script = "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"
    result = subprocess.run(["sudo", script])
    if result.returncode != 0:
        # The helper runs in a separate process, so its exceptions no longer
        # propagate here; record the failure so it is not silently dropped
        # when a caller ignores the return value.
        log.error(f"{script} failed with exit code {result.returncode}")
    return result.returncode


def call_setup_nfs_exports():
    """Run the NFS-exports setup wrapper script through sudo.

    Returns:
        int: exit status of the sudo invocation (0 on success).
    """
    script = "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"
    result = subprocess.run(["sudo", script])
    if result.returncode != 0:
        # The helper runs in a separate process, so its exceptions no longer
        # propagate here; record the failure so it is not silently dropped
        # when a caller ignores the return value.
        log.error(f"{script} failed with exit code {result.returncode}")
    return result.returncode


def start_instances(node_list):
log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list)))

Expand Down Expand Up @@ -486,8 +497,8 @@ def reconfigure_slurm():
install_gres_conf(lkp)
install_cgroup_conf(lkp)
install_topology_conf(lkp)
setup_network_storage(log)
setup_nfs_exports()
call_setup_network_storage()
call_setup_nfs_exports()
log.info("Restarting slurmctld to make changes take effect.")
try:
run("sudo systemctl restart slurmctld.service", check=False)
Expand All @@ -497,7 +508,7 @@ def reconfigure_slurm():
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lkp.instance_role_safe in ["compute", "login"]:
setup_network_storage(log)
call_setup_network_storage()
log.info("Restarting slurmd to make changes take effect.")
run("systemctl restart slurmd")
util.run(f"wall '{update_msg}'", timeout=30)
Expand Down

0 comments on commit 24b5d84

Please sign in to comment.