Skip to content

Commit

Permalink
Update as per feedback, this commit gives slurm user permission to run
Browse files Browse the repository at this point in the history
scripts as root
  • Loading branch information
harshthakkar01 committed May 9, 2024
1 parent 9a2db84 commit 9f9aa2c
Show file tree
Hide file tree
Showing 7 changed files with 140 additions and 21 deletions.
11 changes: 10 additions & 1 deletion ansible/roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,22 @@
- conf.py
- resume.py
- setup.py
- setup_network_storage.py
- startup.sh
- slurmsync.py
- suspend.py
- util.py
- load_bq.py

# Install setup_network_storage.py owned by root:root with mode 0755.
# NOTE(review): presumably kept root-owned because the sudo-allowed wrapper
# scripts execute this module as root — confirm against the sudoers setup.
- name: Copy network storage scripts with root owner
  copy:
    src: scripts/{{item}}
    dest: '{{slurm_paths.scripts}}/{{item}}'
    owner: 'root'
    group: 'root'
    mode: 0o755
  with_items:
    - setup_network_storage.py

- name: Copy slurm_gcp_plugins
copy:
src: scripts/slurm_gcp_plugins
Expand Down
3 changes: 3 additions & 0 deletions scripts/network_storage_wrappers/run_setup_network_storage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Wrapper that invokes the "setup_network_storage" entry point of the
# setup_network_storage Python module from the slurm scripts directory.
#
# Abort if the directory change fails: otherwise python3 would resolve the
# module relative to whatever the current working directory happens to be.
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_network_storage"
3 changes: 3 additions & 0 deletions scripts/network_storage_wrappers/run_setup_nfs_exports.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Wrapper that invokes the "setup_nfs_exports" entry point of the
# setup_network_storage Python module from the slurm scripts directory.
#
# Abort if the directory change fails: otherwise python3 would resolve the
# module relative to whatever the current working directory happens to be.
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_nfs_exports"
34 changes: 26 additions & 8 deletions scripts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@
login_nodeset,
)
import slurmsync
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)


SETUP_SCRIPT = Path(__file__)
Expand Down Expand Up @@ -287,11 +283,33 @@ def setup_nss_slurm():
run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf")


def call_setup_network_storage(log):
    """Run the network storage wrapper script and log the outcome.

    Best-effort: failures are logged through ``log.error`` and never raised
    to the caller.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    wrapper = "/slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh"
    try:
        # check=True makes a non-zero exit status raise CalledProcessError;
        # without it a failed mount would still be reported as a success.
        subprocess.run(wrapper, check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing or non-executable wrapper script.
        log.error(e)
    else:
        log.info("network storage mounted successfully")


def call_setup_nfs_exports(log):
    """Run the NFS exports wrapper script and log the outcome.

    Best-effort: failures are logged through ``log.error`` and never raised
    to the caller.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    wrapper = "/slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh"
    try:
        # check=True makes a non-zero exit status raise CalledProcessError;
        # without it a failed export would still be reported as a success.
        subprocess.run(wrapper, check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing or non-executable wrapper script.
        log.error(e)
    else:
        log.info("nfs exported successfully")


def setup_sudoers():
content = """
# Allow SlurmUser to manage the slurm daemons
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh
slurm ALL=(ALL) NOPASSWD: /slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh
"""
sudoers_file = Path("/etc/sudoers.d/slurm")
sudoers_file.write_text(content)
Expand Down Expand Up @@ -399,7 +417,7 @@ def setup_controller(args):

if cfg.controller_secondary_disk:
setup_secondary_disks()
setup_network_storage(log)
call_setup_network_storage(log)

run_custom_scripts()

Expand Down Expand Up @@ -431,7 +449,7 @@ def setup_controller(args):
run("systemctl enable nfs-server", timeout=30)
run("systemctl start nfs-server", timeout=30)

setup_nfs_exports()
call_setup_nfs_exports(log)
run("systemctl enable --now slurmcmd.timer", timeout=30)

log.info("Check status of cluster services")
Expand Down Expand Up @@ -464,7 +482,7 @@ def setup_login(args):
update_system_config("slurmd", sysconf)
install_custom_scripts()

setup_network_storage(log)
call_setup_network_storage(log)
setup_sudoers()
run("systemctl restart munge")
run("systemctl enable slurmd", timeout=30)
Expand Down Expand Up @@ -498,7 +516,7 @@ def setup_compute(args):
install_custom_scripts()

setup_nss_slurm()
setup_network_storage(log)
call_setup_network_storage(log)

has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode
if has_gpu:
Expand Down
35 changes: 30 additions & 5 deletions scripts/setup_network_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,31 @@
import time

import shutil
import logging
from pathlib import Path
from concurrent.futures import as_completed
from addict import Dict as NSDict

from util import (
cfg,
lkp,
dirs,
separate,
run,
host_lookup,
chown_slurm,
config_root_logger,
load_config_file,
backoff_delay,
)


# Name of this script file; used both as the logger name and as the stem of
# the log file below.
filename = Path(__file__).name
# Log file path: placed in cfg.slurm_log_dir when cfg is truthy, otherwise in
# the current directory; the script's suffix is replaced with ".log".
LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")

# Module-level logger shared by the functions in this file.
log = logging.getLogger(filename)


def setup_nfs_exports():
"""nfs export all needed directories"""
# The controller only needs to set up exports for cluster-internal mounts
Expand Down Expand Up @@ -78,7 +88,7 @@ def setup_nfs_exports():
run("exportfs -a", timeout=30)


def munge_mount_handler(log):
def munge_mount_handler():
cfg = load_config_file(Path(__file__).with_name("config.yaml"))
if not cfg.munge_mount:
log.error("Missing munge_mount in cfg")
Expand Down Expand Up @@ -154,7 +164,7 @@ def munge_mount_handler(log):
shutil.rmtree(local_mount)


def mount_fstab(mounts, log):
def mount_fstab(mounts):
"""Wait on each mount, then make sure all fstab is mounted"""
from more_executors import Executors, ExceptionRetryPolicy

Expand Down Expand Up @@ -250,10 +260,11 @@ def resolve_network_storage(nodeset=None):
return list(mounts.values())


def setup_network_storage(log):
def setup_network_storage():
"""prepare network fs mounts and add them to fstab"""
log.info("Set up network storage")
# filter mounts into two dicts, cluster-internal and external mounts

all_mounts = resolve_network_storage()
ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts)
if lkp.instance_role == "controller":
Expand Down Expand Up @@ -308,5 +319,19 @@ def setup_network_storage(log):
for entry in fstab_entries:
f.write(entry)
f.write("\n")
mount_fstab(mounts_by_local(mounts), log)
munge_mount_handler(log)
mount_fstab(mounts_by_local(mounts))
munge_mount_handler()


if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument naming the
    # function to run ("setup_network_storage" or "setup_nfs_exports").
    chown_slurm(LOGFILE, mode=0o600)
    config_root_logger(filename, logfile=LOGFILE)

    # Explicit dispatch table so a mistyped name is rejected instead of
    # silently falling through to setup_nfs_exports.
    entry_points = {
        "setup_network_storage": setup_network_storage,
        "setup_nfs_exports": setup_nfs_exports,
    }
    if len(sys.argv) != 2 or sys.argv[1] not in entry_points:
        choices = "|".join(entry_points)
        log.error(f"usage: {filename} <{choices}>")
        # Exit non-zero so callers checking the return code see the failure.
        sys.exit(2)

    entry_points[sys.argv[1]]()
29 changes: 22 additions & 7 deletions scripts/slurmsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from itertools import chain
from pathlib import Path
import yaml
import subprocess

import util
from util import (
Expand Down Expand Up @@ -59,10 +60,6 @@
install_cgroup_conf,
install_topology_conf,
)
from setup_network_storage import (
setup_network_storage,
setup_nfs_exports,
)

filename = Path(__file__).name
LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")
Expand Down Expand Up @@ -97,6 +94,24 @@ def start_instance_op(inst, project=None):
)


def call_setup_network_storage(log):
    """Run the network storage wrapper script through sudo and log the outcome.

    Best-effort: failures are logged through ``log.error`` and never raised
    to the caller.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    wrapper = "/slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh"
    try:
        # -n: fail instead of prompting for a password when no matching
        # NOPASSWD rule applies.
        # check=True: a non-zero exit raises CalledProcessError, so a failed
        # mount is not reported as a success.
        subprocess.run(["sudo", "-n", wrapper], check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers sudo itself being missing or not executable.
        log.error(e)
    else:
        log.info("network storage mounted successfully")


def call_setup_nfs_exports(log):
    """Run the NFS exports wrapper script through sudo and log the outcome.

    Best-effort: failures are logged through ``log.error`` and never raised
    to the caller.

    Args:
        log: logger-like object providing ``info`` and ``error``.
    """
    wrapper = "/slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh"
    try:
        # -n: fail instead of prompting for a password when no matching
        # NOPASSWD rule applies.
        # check=True: a non-zero exit raises CalledProcessError, so a failed
        # export is not reported as a success.
        subprocess.run(["sudo", "-n", wrapper], check=True)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers sudo itself being missing or not executable.
        log.error(e)
    else:
        log.info("nfs exported successfully")


def start_instances(node_list):
log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list)))

Expand Down Expand Up @@ -486,8 +501,8 @@ def reconfigure_slurm():
install_gres_conf(lkp)
install_cgroup_conf(lkp)
install_topology_conf(lkp)
setup_network_storage(log)
setup_nfs_exports()
call_setup_network_storage(log)
call_setup_nfs_exports(log)
log.info("Restarting slurmctld to make changes take effect.")
try:
run("sudo systemctl restart slurmctld.service", check=False)
Expand All @@ -497,7 +512,7 @@ def reconfigure_slurm():
util.run(f"wall '{update_msg}'", timeout=30)
log.debug("Done.")
elif lkp.instance_role_safe in ["compute", "login"]:
setup_network_storage(log)
call_setup_network_storage(log)
log.info("Restarting slurmd to make changes take effect.")
run("systemctl restart slurmd")
util.run(f"wall '{update_msg}'", timeout=30)
Expand Down
46 changes: 46 additions & 0 deletions scripts/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@

set -e

SLURM_USER=slurm
SLURM_DIR=/slurm
FLAGFILE=$SLURM_DIR/slurm_configured_do_not_remove
SCRIPTS_DIR=$SLURM_DIR/scripts
STORAGE_WRAPPER_DIR=$SCRIPTS_DIR/network_storage_wrappers

if [[ -z "$HOME" ]]; then
# google-startup-scripts.service lacks environment variables
HOME="$(getent passwd "$(whoami)" | cut -d: -f6)"
Expand Down Expand Up @@ -162,6 +165,49 @@ function fetch_feature {
}
SLURMD_FEATURE="$(fetch_feature)"


# Create the wrapper directory (if missing) and (re)write the two root-owned
# wrapper scripts that invoke the setup_network_storage Python module.
# Reads the globals STORAGE_WRAPPER_DIR and SLURM_USER.
function add_network_storage_wrappers {
	echo "INFO: Setting up network storage wrappers"

	# Check if the target directory exists
	if [[ ! -d "$STORAGE_WRAPPER_DIR" ]]; then
		# Create the directory
		mkdir -p "$STORAGE_WRAPPER_DIR"

		# Set permissions (owner-->slurm, group-->slurm, mode-->0755)
		# NOTE(review): the directory is owned by the slurm user while the
		# scripts inside are meant to be root-owned; confirm a slurm-owned
		# parent directory is acceptable for files run through sudo.
		chown "$SLURM_USER:$SLURM_USER" "$STORAGE_WRAPPER_DIR"
		chmod 0755 "$STORAGE_WRAPPER_DIR"

		echo "INFO: Directory created: ${STORAGE_WRAPPER_DIR}"
	else
		echo "INFO: Directory already exists: ${STORAGE_WRAPPER_DIR}"
	fi

	# Wrapper for setup-network-storage function.
	# The heredoc delimiter is quoted so the script text is written verbatim,
	# with no expansion at generation time.
	cat << 'EOF' > "${STORAGE_WRAPPER_DIR}/run_setup_network_storage.sh"
#!/bin/bash
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_network_storage"
EOF

	# Wrapper for setup-nfs-exports function
	cat << 'EOF' > "${STORAGE_WRAPPER_DIR}/run_setup_nfs_exports.sh"
#!/bin/bash
cd /slurm/scripts/ || exit 1
python3 -m setup_network_storage "setup_nfs_exports"
EOF

	# Set permissions (owner-->root, group-->root, mode-->0744)
	chown root:root "${STORAGE_WRAPPER_DIR}/run_setup_network_storage.sh" "${STORAGE_WRAPPER_DIR}/run_setup_nfs_exports.sh"
	chmod 0744 "${STORAGE_WRAPPER_DIR}/run_setup_network_storage.sh" "${STORAGE_WRAPPER_DIR}/run_setup_nfs_exports.sh"

	echo "INFO: network storage wrappers created"
}

# Add network storage wrappers to be used by setup and slurm sync scripts to
# mount network storage.
add_network_storage_wrappers

echo "INFO: Running python cluster setup script"
chmod +x $SETUP_SCRIPT_FILE
python3 $SCRIPTS_DIR/util.py
Expand Down

0 comments on commit 9f9aa2c

Please sign in to comment.