diff --git a/README.md b/README.md
index bed7e57..03cc911 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@
 Salt formula provisioning a Slurm cluster
 
-Availables states:
- * Munge
- * Screen
- * Slurm
- * Slurm Database
- * SSH
+To install Slurm nodes, first prepare the following on the Slurm master node:
+
+- copy munge.key from /etc/munge/munge.key to /srv/salt/munge.key
+- copy slurm.cert from /etc/slurm-llnl/slurm.cert to /srv/salt/slurm.cert
+- copy slurm.conf from files/etc/slurm-llnl/slurm.conf to /srv/salt/slurm.conf
+- create empty cgroup.conf and gres.conf files in /srv/salt/
 
diff --git a/files/etc/apt/preferences.d/disable-utopic-policy b/files/etc/apt/preferences.d/disable-utopic-policy
deleted file mode 100644
index fd20508..0000000
--- a/files/etc/apt/preferences.d/disable-utopic-policy
+++ /dev/null
@@ -1,5 +0,0 @@
-Explanation: disable utopic packages by default
-Package: *
-Pin: release n=utopic
-Pin-priority: -1
-
diff --git a/files/etc/apt/preferences.d/slurm-utopic-policy b/files/etc/apt/preferences.d/slurm-utopic-policy
deleted file mode 100644
index db54609..0000000
--- a/files/etc/apt/preferences.d/slurm-utopic-policy
+++ /dev/null
@@ -1,5 +0,0 @@
-Explanation: Grab Slurm from utopic
-Package: libpam-slurm libpmi0 libpmi0-dev libslurm-dev libslurm-perl libslurm26 libslurmdb-dev libslurmdb-perl libslurmdb26 slurm-llnl slurm-llnl-basic-plugins slurm-llnl-basic-plugins-dev slurm-llnl-doc slurm-llnl-slurmdbd slurm-llnl-sview slurm-llnl-torque
-Pin: release n=utopic
-Pin-priority: 500
-
diff --git a/files/etc/apt/sources.list.d/utopic.list b/files/etc/apt/sources.list.d/utopic.list
deleted file mode 100644
index c7e862a..0000000
--- a/files/etc/apt/sources.list.d/utopic.list
+++ /dev/null
@@ -1 +0,0 @@
-deb http://archive.ubuntu.com/ubuntu utopic main restricted universe multiverse
diff --git a/files/etc/apt/sources.list.d/xenial.list b/files/etc/apt/sources.list.d/xenial.list
new file mode 100644
index 0000000..2c9d312
--- /dev/null
+++ b/files/etc/apt/sources.list.d/xenial.list
@@ -0,0 +1 @@
+deb http://archive.ubuntu.com/ubuntu xenial main restricted universe multiverse
diff --git a/files/etc/default/munge b/files/etc/default/munge
deleted file mode 100644
index 03556f8..0000000
--- a/files/etc/default/munge
+++ /dev/null
@@ -1,6 +0,0 @@
-##
-# Overcome incompatibility with Ubuntu 14.04
-# https://bugs.launchpad.net/ubuntu/+source/munge/+bug/1287624
-##
-OPTIONS="--force" # --key-file /etc/munge/munged.key --num-threads 1"
-
diff --git a/files/etc/security/access.conf b/files/etc/security/access.conf
index 78bca95..8c0bd9c 100644
--- a/files/etc/security/access.conf
+++ b/files/etc/security/access.conf
@@ -9,4 +9,4 @@
 # who wants to be able to SSH in as root via public-key on Biomedia servers.
# disable SSH for anybody but root +:root:ALL --:ALL EXCEPT (csg) dr jpassera bglocker:ALL +-:ALL EXCEPT (csg) (biomedia) dr jpassera bglocker jgao:ALL diff --git a/files/etc/slurm-llnl/cgroup.conf b/files/etc/slurm-llnl/cgroup.conf index 1ba06c1..4ed58d9 100644 --- a/files/etc/slurm-llnl/cgroup.conf +++ b/files/etc/slurm-llnl/cgroup.conf @@ -16,7 +16,8 @@ CgroupReleaseAgentDir=/var/spool/slurm-llnl/cgroup ConstrainCores=yes TaskAffinity=yes #ConstrainRAMSpace=no -### not used yet -#ConstrainDevices=no -#AllowedDevicesFile=/etc/slurm-llnl/cgroup_allowed_devices_file.conf - +ConstrainSwapSpace=yes +AllowedSwapSpace=10.0 +# Not well supported until Slurm v14.11.4 https://groups.google.com/d/msg/slurm-devel/oKAUed7AETs/Eb6thh9Lc0YJ +#ConstrainDevices=yes +#AllowedDevicesFile=/etc/slurm-llnl/cgroup_allowed_devices_file.conf \ No newline at end of file diff --git a/files/etc/slurm-llnl/gres.conf b/files/etc/slurm-llnl/gres.conf new file mode 100644 index 0000000..e69de29 diff --git a/files/etc/slurm-llnl/monal01/gres.conf b/files/etc/slurm-llnl/gres.conf_gpu similarity index 100% rename from files/etc/slurm-llnl/monal01/gres.conf rename to files/etc/slurm-llnl/gres.conf_gpu diff --git a/files/etc/slurm-llnl/bardolph/gres.conf b/files/etc/slurm-llnl/gres.conf_gpu_l similarity index 50% rename from files/etc/slurm-llnl/bardolph/gres.conf rename to files/etc/slurm-llnl/gres.conf_gpu_l index 6950eb2..0acc951 100644 --- a/files/etc/slurm-llnl/bardolph/gres.conf +++ b/files/etc/slurm-llnl/gres.conf_gpu_l @@ -1,2 +1,4 @@ Name=gpu File=/dev/nvidia0 Name=gpu File=/dev/nvidia1 +Name=gpu File=/dev/nvidia2 +Name=gpu File=/dev/nvidia3 diff --git a/files/etc/slurm-llnl/slurm.conf b/files/etc/slurm-llnl/slurm.conf index de57e6c..2c7923b 100644 --- a/files/etc/slurm-llnl/slurm.conf +++ b/files/etc/slurm-llnl/slurm.conf @@ -31,6 +31,7 @@ JobCredentialPublicCertificate=/etc/slurm-llnl/slurm.cert #Licenses=foo*4,bar MailProg=/usr/bin/mail MaxJobCount=25000 +MaxArraySize=32000 #MaxStepCount=40000 #MaxTasksPerNode=128 MpiDefault=none @@ -119,9 +120,9 @@ PreemptMode=OFF # # LOGGING AND ACCOUNTING DefaultStorageType=slurmdbd -DefaultStorageUser={{ pillar['slurm']['db']['user'] }} +DefaultStorageUser=slurm #DefaultStorageLoc=/var/log/slurm-llnl/job_completions.log -DefaultStorageHost={{ pillar['slurm']['controller'] }} +DefaultStorageHost=biomedia03 DefaultStoragePort=6819 AccountingStorageEnforce=associations,limits #AccountingStorageHost= @@ -132,11 +133,11 @@ AccountingStorageEnforce=associations,limits #AccountingStorageUser= AccountingStoreJobComment=YES ClusterName=biomediacluster -#DebugFlags= -#JobCompHost={{ pillar['slurm']['controller'] }} +DebugFlags=Gres +#JobCompHost=biomedia03 #JobCompLoc= -#JobCompUser={{ pillar['slurm']['db']['user'] }} -#JobCompPass={{ pillar['slurm']['db']['password'] }} +#JobCompUser=slurm +#JobCompPass=1BUy4eVv7X #JobCompPort= JobCompType=jobcomp/none JobAcctGatherFrequency=30 @@ -164,6 +165,7 @@ SlurmSchedLogFile=/var/log/slurm-llnl/sched.log # # # GRes configuration +# GresTypes=gpu GresTypes={{ ','.join(pillar['slurm']['gres']) }} # COMPUTE NODES {% for node, values in pillar['slurm']['nodes']['batch']['cpus'].items() %} @@ -177,13 +179,12 @@ NodeName={{ node }} RealMemory={{ values.mem }} CPUs={{ values.cores }} Gres={{ {% endfor %} # Partitions PartitionName=long Nodes={{ ','.join(pillar['slurm']['nodes']['batch']['cpus']) }} Default=YES MaxTime=43200 -PartitionName=short Nodes={{ ','.join(pillar['slurm']['nodes']['batch']['cpus']) }} Default=NO MaxTime=60 Priority=5000 
-PartitionName=gpus Nodes={{ ','.join(pillar['slurm']['nodes']['batch']['gpus']) }} Default=NO MaxTime=10080 -PartitionName=interactive Nodes={{ ','.join(pillar['slurm']['nodes']['interactive']['cpus']) }} Default=NO MaxTime=4320 Priority=7000 PreemptMode=OFF +#PartitionName=short Nodes={{ ','.join(pillar['slurm']['nodes']['batch']['cpus']) }} Default=NO MaxTime=60 Priority=5000 +PartitionName=gpus Nodes={{ ','.join(pillar['slurm']['nodes']['batch']['gpus']) }} Default=NO MaxTime=10080 MaxCPUsPerNode=4 MaxMemPerNode=30720 +#PartitionName=interactive Nodes={{ ','.join(pillar['slurm']['nodes']['interactive']['cpus']) }} Default=NO MaxTime=4320 Priority=7000 PreemptMode=OFF -{% set rocsList = [] %} -{% for node, values in pillar['slurm']['nodes']['batch']['cpus'].items() %} {% if node.startswith('roc') %} {% set rocsListTrash = rocsList.append(node) %} {% endif %} {% endfor %} - -PartitionName=rocsLong Nodes={{ ','.join(rocsList) }} Default=NO MaxTime=43200 -PartitionName=rocsShort Nodes={{ ','.join(rocsList) }} Default=NO MaxTime=60 Priority=5000 +#{% set rocsList = [] %} +#{% for node, values in pillar['slurm']['nodes']['batch']['cpus'].items() %} {% if node.startswith('roc') %} {% set rocsListTrash = rocsList.append(node) %} {% endif %} {% endfor %} +#PartitionName=rocsLong Nodes={{ ','.join(rocsList) }} Default=NO MaxTime=43200 +#PartitionName=rocsShort Nodes={{ ','.join(rocsList) }} Default=NO MaxTime=60 Priority=5000 \ No newline at end of file diff --git a/files/etc/slurm-llnl/slurmdbd.conf b/files/etc/slurm-llnl/slurmdbd.conf index 2b65573..0e853f2 100644 --- a/files/etc/slurm-llnl/slurmdbd.conf +++ b/files/etc/slurm-llnl/slurmdbd.conf @@ -6,7 +6,7 @@ ArchiveSuspend=no #ArchiveScript=/usr/sbin/slurm.dbd.archive #AuthInfo=/var/run/munge/munge.socket.2 AuthType=auth/munge -DbdHost={{ pillar['slurm']['controller'] }} +DbdHost=biomedia03 DbdPort=6819 DebugLevel=info PurgeEventAfter=1month @@ -16,10 +16,10 @@ PurgeSuspendAfter=1month LogFile=/var/log/slurm-llnl/slurmdbd.log PidFile=/var/run/slurm-llnl/slurmdbd.pid SlurmUser=slurm -#StorageHost={{ pillar['slurm']['controller'] }} +#StorageHost=biomedia03 StorageHost=localhost StorageType=accounting_storage/mysql StoragePort=3306 -StorageLoc={{ pillar['slurm']['db']['name'] }} -StorageUser={{ pillar['slurm']['db']['user'] }} -StoragePass={{ pillar['slurm']['db']['password'] }} +StorageLoc=slurmdb +StorageUser=slurm +StoragePass=1BUy4eVv7X \ No newline at end of file diff --git a/get_slurm_ver.sls b/get_slurm_ver.sls new file mode 100644 index 0000000..d55c80e --- /dev/null +++ b/get_slurm_ver.sls @@ -0,0 +1,3 @@ +get slurmd ver: + cmd.run: + - name: dpkg -s slurmd |grep "^Version:" > /tmp/local_slurm_ver.txt diff --git a/files/etc/slurm-llnl/monal02/gres.conf b/gres.conf_gpu similarity index 75% rename from files/etc/slurm-llnl/monal02/gres.conf rename to gres.conf_gpu index f4049ec..ec8d738 100644 --- a/files/etc/slurm-llnl/monal02/gres.conf +++ b/gres.conf_gpu @@ -1,6 +1,8 @@ Name=gpu File=/dev/nvidia0 Name=gpu File=/dev/nvidia1 +Name=gpu File=/dev/nvidia2 Name=gpu File=/dev/nvidia3 Name=gpu File=/dev/nvidia4 Name=gpu File=/dev/nvidia5 +Name=gpu File=/dev/nvidia6 Name=gpu File=/dev/nvidia7 diff --git a/init.sls b/init.sls index 947bce0..a2de958 100644 --- a/init.sls +++ b/init.sls @@ -1,89 +1,141 @@ -# State munge has to be included to allow dependency on munge package -include: - - .munge - - .ubuntu-utopic - - .ssh - - .screen -{% if grains['host'] == pillar['slurm']['controller'] %} - - .slurmdbd -{% endif %} - -# The SLURM scheduling 
system +# munge.key must be the same across all the nodes of the cluster -/etc/slurm-llnl/slurm.conf: - file.managed: - - source: salt://slurm/files/etc/slurm-llnl/slurm.conf - - template: jinja - -/etc/slurm-llnl/cgroup.conf: - file.managed: - - source: salt://slurm/files/etc/slurm-llnl/cgroup.conf +munge: + pkg: + - installed + group.present: + - system: True + - gid: 98 + user.present: + - uid: 69 + - gid_from_name: True + - system: True + - shell: /bin/true + - createhome: False + service.running: + - name: munge + - enable: False + - require: + - file: /etc/munge/munge.key -/etc/slurm-llnl/slurm.cert: - file.managed: - - source: salt://slurm/files/etc/slurm-llnl/slurm.cert - - mode: 400 +#stop munge: +# cmd.run: +# - name: systemctl stop munge +/var/log/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge +/var/run/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge -# FIXME can the next 2 be factored? -/var/spool/slurm-llnl: +/var/lib/munge: file.directory: - - group: slurm - - user: slurm + - group: munge + - user: munge + - recurse: + - user + - group - require: - - user: slurm + - user: munge -/var/spool/slurm-llnl/cgroup: +/etc/munge: file.directory: - - group: slurm - - user: slurm + - group: munge + - user: munge + - recurse: + - user + - group - require: - - user: slurm + - user: munge -{% if grains['host'] != pillar['slurm']['controller'] %} -/var/log/slurm-llnl/slurm.log: - file.managed: - - group: slurm - - user: slurm +/run/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group - require: - - user: slurm -{% endif %} + - user: munge +#reload munge: +# cmd.run: +# - name: systemctl start munge -# specific to SLURM Controller -{% if grains['host'] == pillar['slurm']['controller'] %} -/etc/slurm-llnl/slurm.key: - file.managed: - - source: salt://slurm/files/etc/slurm-llnl/slurm.key - - mode: 400 - -## Unused because of slurmdbd -#/var/log/slurm-llnl/accounting.log: -# file.managed: -# - group: slurm -# - user: slurm -# - require: -# - user: slurm -# -#/var/log/slurm-llnl/job_completions.log: -# file.managed: -# - group: slurm -# - user: slurm + +#munge: +# pkg: +# - installed +# service.running: +# - enable: False +# group.present: +# - system: True +# - gid: 98 +# user.present: +# - uid: 69 +# - gid_from_name: True +# - system: True +# - shell: /bin/true +# - createhome: False +# service.running: +# - name: munge +# - watch: +# - user: munge +# - file: /etc/munge/munge.key # - require: -# - user: slurm +# - pkg: munge +# - user: munge +# - file: /etc/munge/munge.key -/var/log/slurm-llnl/slurmctld.log: - file.managed: - - group: slurm - - user: slurm - - require: - - user: slurm +#install munge: +# pkg: +# - installed +# service.running: +# - enable: False -/var/log/slurm-llnl/sched.log: +copy munge.key file: file.managed: - - group: slurm - - user: slurm - - require: - - user: slurm -{% endif %} + - name: /etc/munge/munge.key + - source: salt://munge.key + +#reload munge: +# service.running: +# - enable: True +# - reload: True +# - watch: +# - pkg: munge + + +starting munge: + cmd.run: + - name: systemctl start munge + + +install dependencies for Slurm: + cmd.run: + - name: aptitude install -y libipmimonitoring5a + +install slurm packages from local repo: + pkg.installed: + - sources: + - libhdf5: salt://slurm_deb/libhdf5-100_1.10.0-patch1+docs-3_amd64.deb + - libhwloc5: 
salt://slurm_deb/libhwloc5_1.11.5-1_amd64.deb + - libpng16: salt://slurm_deb/libpng16-16_1.6.28-1_amd64.deb + - libreadline7: salt://slurm_deb/libreadline7_7.0-0ubuntu2_amd64.deb + - librrd8: salt://slurm_deb/librrd8_1.6.0-1_amd64.deb + - slurm-wlm-basic-plugins: salt://slurm_deb/slurm-wlm-basic-plugins_16.05.9-1ubuntu1_amd64.deb + - slurmd: salt://slurm_deb/slurmd_16.05.9-1ubuntu1_amd64.deb # TODO handle different names given distro (slurm, slurm-llnl, ...) @@ -93,37 +145,86 @@ slurm: - gid: 97 user.present: - fullname: SLURM daemon user account - - uid: 97 + - uid: 14 - gid_from_name: True - system: True - home: /var/spool/slurm-llnl - shell: /bin/true - pkg.installed: - - name: slurm-llnl - service.running: - - name: slurm-llnl - - watch: - - user: slurm - - pkg: slurm - - pkg: slurm-plugins - - pkg: munge - - file: /etc/slurm-llnl/slurm.conf - - file: /var/spool/slurm-llnl -{% if grains['host'] == pillar['slurm']['controller'] %} - - file: /var/log/slurm-llnl/sched.log - - file: /var/log/slurm-llnl/slurmctld.log - - pkg: slurmdbd -{% endif %} +# service.running: +# - name: slurmd +# - watch: +# - require: +# - pkg: slurmd +# - user: slurm +# - file: /etc/slurm-llnl/slurm.conf +# - file: /var/spool/slurm-llnl -slurm-plugins: - pkg.installed: - - name: slurm-llnl-basic-plugins + +# The SLURM scheduling system + +/etc/slurm-llnl/slurm.conf: + file.managed: + - name: /etc/slurm-llnl/slurm.conf + - source: salt://files/etc/slurm-llnl/slurm.conf + +/etc/slurm-llnl/cgroup.conf: + file.managed: + - name: /etc/slurm-llnl/cgroup.conf + - source: salt://files/etc/slurm-llnl/cgroup.conf /etc/slurm-llnl/gres.conf: file.managed: -{% if grains['host'] in [ "bardolph", "monal01", "monal02"] %} - - source: salt://slurm/files/etc/slurm-llnl/{{ grains['host'] }}/gres.conf -{% else %} - - source: salt://slurm/files/etc/slurm-llnl/default/gres.conf + - name: /etc/slurm-llnl/gres.conf +{% if grains['host'] in ['monal01', 'monal02'] %} + - source: salt://files/etc/slurm-llnl/gres_gpu.conf +{% elif grains['host'] in ['monal03'] %} + - source: salt://files/etc/slurm-llnl/gres_gpu_l.conf +{% else %} + - source: salt://files/etc/slurm-llnl/gres.conf {% endif %} + +/etc/slurm-llnl/slurm.cert: + file.managed: + - name: /etc/slurm-llnl/slurm.cert + - source: salt://slurm.cert + +/etc/slurm-llnl/slurm.key: + file.managed: + - name: /etc/slurm-llnl/slurm.key + - source: salt://slurm.key + + +/var/spool/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + + +/var/run/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +/var/log/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm +reload slurmd: + cmd.run: + - name: systemctl restart slurmd diff --git a/pillar.example b/pillar.example index a28ce73..58df2cf 100644 --- a/pillar.example +++ b/pillar.example @@ -1,38 +1,23 @@ slurm: - controller: predict5 + controller: biomedia03 nodes: batch: gpus: - bardolph: - mem: 15972 - cores: 12 + monal01: + mem: 80000 + cores: 40 gres: - gpu: 2 - + gpu: 8 + monal02: + mem: 80000 + cores: 40 + gres: + gpu: 8 cpus: - predict1: - mem: 30160 - cores: 8 - {% for N in range(2,4) %} - predict{{N}}: - mem: 32176 - cores: 8 - {% endfor %} - {% for N in range(6,9,2) %} - predict{{N}}: - mem: 32176 - cores: 8 - {% endfor %} - - {% for N in range(1,4,2) %} + {% for N in [1,2,5] %} biomedia0{{N}}: - mem: 64417 - cores: 24 - {% 
endfor %} - {% for N in range(2,6,3) %} - biomedia0{{N}}: - mem: 64417 + mem: 63800 cores: 24 {% endfor %} {% for N in range(6,10) %} @@ -40,12 +25,12 @@ slurm: mem: 128850 cores: 64 {% endfor %} - biomedia10: - mem: 128851 + biomedia10: + mem: 128850 cores: 24 - {% for N in range(4,17) %} + {% for N in range(1,17) %} roc{{ "%02d" % N }}: - mem: 257875 + mem: 257869 cores: 32 {% endfor %} @@ -54,13 +39,7 @@ slurm: biomedia11: mem: 257919 cores: 32 - {% for N in range(1,4) %} - roc{{ "%02d" % N }}: - mem: 257906 - cores: 32 - {% endfor %} - - + gres: - gpu diff --git a/reconfig_cpu_nodes.sls b/reconfig_cpu_nodes.sls new file mode 100644 index 0000000..5264aa6 --- /dev/null +++ b/reconfig_cpu_nodes.sls @@ -0,0 +1,153 @@ +# for reconfiguring CPU nodes only +# check munge.key +munge: + pkg: + - installed + group.present: + - system: True + - gid: 98 + user.present: + - uid: 69 + - gid_from_name: True + - system: True + - shell: /bin/true + - createhome: False + service.running: + - name: munge + - require: + - pkg: munge + - user: munge + - file: /etc/munge/munge.key + +/var/log/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge + +/run/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge + +/var/lib/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge +precopy munge.key file: + file.managed: + - name: /tmp/master_munge.key + - source: salt://slurm_conf_files/munge.key + +#check if munge.key are different: +{% if salt['cmd.run'](" diff /tmp/master_munge.key /etc/munge/munge.key" ) %} + +munge key diff output: + cmd.run: + - name: echo "munge keys are different. New key is to be copied from Salt master" + +copy munge.key file: + file.managed: + - name: /etc/munge/munge.key + - source: salt://slurm_conf_files/munge.key +munge re-start: + cmd.run: + - name: systemctl restart munge + +{% else %} +munge key no diff output: + cmd.run: + - name: echo "munge key is identical with Salt master, no further action needed." +{% endif %} + +# === check slurm === +copy slurm controller ver: + file.managed: + - name: /tmp/slurm_master_ver.txt + - source: salt://slurm_conf_files/slurm_ver.txt + +# TODO handle different names given distro (slurm, slurm-llnl, ...) 
+slurm: + group.present: + - system: True + - gid: 97 + user.present: + - fullname: SLURM daemon user account + - uid: 14 + - gid_from_name: True + - system: True + - home: /var/spool/slurm-llnl + - shell: /bin/true + service.running: + - name: slurmd + - require: + - file: /etc/slurm-llnl/slurm.conf + +# The SLURM scheduling system + +/etc/slurm-llnl/slurm.conf: + file.managed: + - name: /etc/slurm-llnl/slurm.conf + - source: salt://slurm_conf_files/slurm.conf + +/etc/slurm-llnl/cgroup.conf: + file.managed: + - name: /etc/slurm-llnl/cgroup.conf + - source: salt://slurm_conf_files/cgroup.conf + +/etc/slurm-llnl/gres.conf: + file.managed: + - name: /etc/slurm-llnl/gres.conf + +/etc/slurm-llnl/slurm.cert: + file.managed: + - name: /etc/slurm-llnl/slurm.cert + - source: salt://slurm_conf_files/slurm.cert + +/var/spool/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +/var/run/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +/var/log/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +slurmd re-start: + cmd.run: + - name: systemctl restart slurmd diff --git a/reconfig_gpu_nodes.sls b/reconfig_gpu_nodes.sls new file mode 100644 index 0000000..26f722f --- /dev/null +++ b/reconfig_gpu_nodes.sls @@ -0,0 +1,38 @@ +# for reconfiguring GPU nodes only to update the slurm.conf + +# TODO handle different names given distro (slurm, slurm-llnl, ...) +slurm: + group.present: + - system: True + - gid: 97 + user.present: + - fullname: SLURM daemon user account + - uid: 14 + - gid_from_name: True + - system: True + - home: /var/spool/slurm-llnl + - shell: /bin/true + service.running: + - name: slurmd + - require: + - file: /etc/slurm-llnl/slurm.conf + +# The SLURM scheduling system + +pre copy the slurm.conf: + file.managed: + - name: /tmp/slurm.conf + - source: salt://slurm_conf_files/slurm.conf + +{% if salt['cmd.run'](' diff /tmp/slurm.conf /etc/slurm-llnl/slurm.conf ') %} + +/etc/slurm-llnl/slurm.conf: + file.managed: + - name: /etc/slurm-llnl/slurm.conf + - source: salt://slurm_conf_files/slurm.conf + +slurmd re-start: + cmd.run: + - name: systemctl restart slurmd + +{% endif %} diff --git a/remove_old_slurm.sh b/remove_old_slurm.sh new file mode 100644 index 0000000..ee17b7c --- /dev/null +++ b/remove_old_slurm.sh @@ -0,0 +1,14 @@ +#!/bin/bash +installed_pkgs=$(dpkg -l|grep slurm |awk '{print $2}') +while [ "$installed_pkgs" ]; do + for pkgs in $installed_pkgs; do + echo $pkgs + { #try + apt-get remove --purge $pkgs + } || { #catch + dpkg -r $pkgs + dpkg -P $pkgs + } + done + installed_pkgs=$(dpkg -l|grep slurm |awk '{print $2}') +done diff --git a/slurm.conf b/slurm.conf new file mode 100644 index 0000000..396e1d8 --- /dev/null +++ b/slurm.conf @@ -0,0 +1,200 @@ +# slurm.conf file generated by configurator.html. +# Put this file on all nodes of your cluster. +# See the slurm.conf man page for more information. +# +# Workaround because Slurm does not recognize full hostname... 
+ControlMachine=biomedia03 +#ControlAddr= +#BackupController= +#BackupAddr= +# +AuthType=auth/munge +CacheGroups=0 +#CheckpointType=checkpoint/none +CryptoType=crypto/munge +#DisableRootJobs=NO +#EnforcePartLimits=NO +#Epilog= +#EpilogSlurmctld= +#FirstJobId=1 +#MaxJobId=999999 +#GresTypes= +#GroupUpdateForce=0 +#GroupUpdateTime=600 +#JobCheckpointDir=/var/slurm/checkpoint +JobCredentialPrivateKey=/etc/slurm-llnl/slurm.key +JobCredentialPublicCertificate=/etc/slurm-llnl/slurm.cert +#JobFileAppend=0 +#JobRequeue=1 +#JobSubmitPlugins=1 +#KillOnBadExit=0 +#Licenses=foo*4,bar +MailProg=/usr/bin/mail +MaxJobCount=25000 +MaxArraySize=32000 +#MaxStepCount=40000 +#MaxTasksPerNode=128 +MpiDefault=none +#MpiParams=ports=#-# +#PluginDir= +#PlugStackConfig= +#PrivateData=jobs +#PrologSlurmctld= +#PropagatePrioProcess=0 +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +ReturnToService=2 +#SallocDefaultCommand= +SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid +SlurmctldPort=6817 +SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid +SlurmdPort=6818 +SlurmdSpoolDir=/var/spool/slurm-llnl +SlurmUser=slurm +#SlurmdUser=root +#SrunEpilog= +#SrunProlog= +StateSaveLocation=/var/spool/slurm-llnl +SwitchType=switch/none +#TaskEpilog= +TaskPlugin=task/cgroup +TaskPluginParam=Sched +#TaskProlog= +#TopologyPlugin=topology/tree +#TmpFs=/tmp +#TrackWCKey=no +#TreeWidth= +#UnkillableStepProgram= +UsePAM=1 +# +# +# TIMERS +#BatchStartTimeout=10 +#CompleteWait=0 +#EpilogMsgTime=2000 +#GetEnvTimeout=2 +#HealthCheckInterval=0 +#HealthCheckProgram= +InactiveLimit=7200 +KillWait=30 +#MessageTimeout=10 +#ResvOverRun=0 +MinJobAge=300 +#OverTimeLimit=0 +SlurmctldTimeout=120 +SlurmdTimeout=300 +#UnkillableStepTimeout=60 +#VSizeFactor=0 +Waittime=0 +# +# +# SCHEDULING +DefMemPerCPU=2048 +FastSchedule=1 +#MaxMemPerCPU=0 +#SchedulerRootFilter=1 +#SchedulerTimeSlice=30 +SchedulerType=sched/builtin +SchedulerPort=7321 +SelectType=select/cons_res +#SelectTypeParameters=CR_Core_Memory +SelectTypeParameters=CR_CPU_Memory +# +# +# JOB PRIORITY +PriorityType=priority/multifactor +PriorityDecayHalfLife=7-0 +PriorityCalcPeriod=5 +PriorityUsageResetPeriod=NONE +PriorityFavorSmall=YES +PriorityMaxAge=7-0 +PriorityWeightAge=2000 +PriorityWeightFairshare=10000 +PriorityWeightJobSize=3000 +PriorityWeightPartition=1000 +PriorityWeightQOS=0 +# +# QUEUE PRE-EMPTION SETTINGS +PreemptType=preempt/none +PreemptMode=OFF +# +# LOGGING AND ACCOUNTING +DefaultStorageType=slurmdbd +DefaultStorageUser=slurm +#DefaultStorageLoc=/var/log/slurm-llnl/job_completions.log +DefaultStorageHost=biomedia03 +DefaultStoragePort=6819 +AccountingStorageEnforce=associations,limits +#AccountingStorageHost= +#AccountingStorageLoc=/var/log/slurm-llnl/accounting.log +#AccountingStoragePass= +#AccountingStoragePort= +#AccountingStorageType=accounting_storage/filetxt +#AccountingStorageUser= +AccountingStoreJobComment=YES +ClusterName=biomediacluster +DebugFlags=Gres +#JobCompHost=biomedia03 +#JobCompLoc= +#JobCompUser=slurm +#JobCompPass=1BUy4eVv7X +#JobCompPort= +JobCompType=jobcomp/none +JobAcctGatherFrequency=30 +JobAcctGatherType=jobacct_gather/linux +Proctracktype=proctrack/cgroup +# +SlurmctldDebug=info +SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log +SlurmdDebug=info +SlurmdLogFile=/var/log/slurm-llnl/slurm.log +SlurmSchedLogFile=/var/log/slurm-llnl/sched.log +#SlurmSchedLogLevel= +# +# +# POWER SAVE SUPPORT FOR IDLE NODES (optional) +#SuspendProgram= +#ResumeProgram= +#SuspendTimeout= +#ResumeTimeout= +#ResumeRate= +#SuspendExcNodes= +#SuspendExcParts= 
+#SuspendRate= +#SuspendTime= +# +# +# GRes configuration +GresTypes=gpu +# COMPUTE NODES + +NodeName=biomedia[01,02,05] RealMemory=64000 CPUs=24 State=UNKNOWN + +#NodeName=biomedia02 RealMemory=63800 CPUs=24 State=UNKNOWN + +#NodeName=biomedia05 RealMemory=64414 CPUs=24 State=UNKNOWN + +NodeName=biomedia0[6-9] RealMemory=118000 CPUs=64 State=UNKNOWN + +NodeName=biomedia10 RealMemory=118000 CPUs=24 State=UNKNOWN + +NodeName=biomedia11 RealMemory=252000 CPUs=32 State=UNKNOWN + +NodeName=roc0[1-9] RealMemory=243000 CPUs=32 State=UNKNOWN + +NodeName=roc[10-16] RealMemory=243000 CPUs=32 State=UNKNOWN + +NodeName=monal0[1-2] RealMemory=210000 CPUs=40 Gres=gpu:8 State=UNKNOWN + +NodeName=monal03 RealMemory=220000 CPUs=56 Gres=gpu:4 State=UNKNOWN + +# Partitions +PartitionName=long Nodes=biomedia0[1-2],biomedia05,biomedia0[6-10],roc0[4-10],roc[11-16] Default=YES MaxTime=43200 +PartitionName=rocsLong Nodes=roc0[1-3] Default=NO MaxTime=43200 + +PartitionName=gpus Nodes=monal01,monal02,monal03 Default=NO MaxTime=10080 +PartitionName=interactive Nodes=biomedia11 Default=NO MaxTime=4320 Priority=7000 PreemptMode=OFF + +#PartitionName=rocsLong Nodes= Default=NO MaxTime=43200 +#PartitionName=rocsShort Nodes= Default=NO MaxTime=60 Priority=5000 + diff --git a/tools/add_users_to_slurm_groups.sh b/tools/add_users_to_slurm_groups.sh deleted file mode 100755 index 0b33e64..0000000 --- a/tools/add_users_to_slurm_groups.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -for i in `grep biomedia /etc/group | cut -d ':' -f 4 | sed -e 's/,/ /g' `; do yes | sacctmgr add user $i DefaultAccount=biomedia; done diff --git a/tools/install_salt-minion.sh b/tools/install_salt-minion.sh deleted file mode 100755 index 036168f..0000000 --- a/tools/install_salt-minion.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -#### -# -# Initial installation of salt minion packages and configuration files -# from the salt master. 
-# -#### - -ssh -oStrictHostKeyChecking=no $1 aptitude -y install salt-minion -scp -oStrictHostKeyChecking=no /etc/salt/minion $1:/etc/salt/ -ssh -oStrictHostKeyChecking=no $1 /etc/init.d/salt-minion restart - diff --git a/ubuntu-utopic.sls b/ubuntu-utopic.sls deleted file mode 100644 index e007f15..0000000 --- a/ubuntu-utopic.sls +++ /dev/null @@ -1,17 +0,0 @@ -# Ubuntu Utopic required for Slurm 2.6.7 - -/etc/apt/preferences.d/disable-utopic-policy: - file.managed: - - source: salt://slurm/files/etc/apt/preferences.d/disable-utopic-policy - - mode: 644 - -/etc/apt/preferences.d/slurm-utopic-policy: - file.managed: - - source: salt://slurm/files/etc/apt/preferences.d/slurm-utopic-policy - - mode: 644 - -/etc/apt/sources.list.d/utopic.list: - file.managed: - - source: salt://slurm/files/etc/apt/sources.list.d/utopic.list - - mode: 644 - diff --git a/validate_slurm.sls b/validate_slurm.sls new file mode 100644 index 0000000..70ffac5 --- /dev/null +++ b/validate_slurm.sls @@ -0,0 +1,191 @@ + +# check munge.key +munge: + pkg: + - installed + group.present: + - system: True + - gid: 98 + user.present: + - uid: 98 + - gid_from_name: True + - system: True + - shell: /bin/true + - createhome: False + service.running: + - name: munge + - require: + - pkg: munge + - user: munge + - file: /etc/munge/munge.key + +/var/log/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge + +/run/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge + +/var/lib/munge: + file.directory: + - group: munge + - user: munge + - recurse: + - user + - group + - require: + - user: munge +precopy munge.key file: + file.managed: + - name: /tmp/master_munge.key + - source: salt://slurm_conf_files/munge.key + +#check if munge.key are different: +{% if salt['cmd.run'](" diff /tmp/master_munge.key /etc/munge/munge.key" ) %} + +munge key diff output: + cmd.run: + - name: echo "munge keys are different. New key is to be copied from Salt master" + +copy munge.key file: + file.managed: + - name: /etc/munge/munge.key + - source: salt://slurm_conf_files/munge.key +munge re-start: + cmd.run: + - name: systemctl restart munge + +{% else %} +munge key no diff output: + cmd.run: + - name: echo "munge key is identical with Salt master, no further action needed." 
+{% endif %} + +# === check slurm === +copy slurm controller ver: + file.managed: + - name: /tmp/slurm_master_ver.txt + - source: salt://slurm_conf_files/slurm_ver.txt + +# do the following get slurm ver separately to guarantee it runs properly +#get slurmd ver: +# cmd.run: +# - name: dpkg -s slurmd |grep "^Version:" > /tmp/local_slurm_ver.txt + +#check versions: +{% if salt['cmd.run'](" diff /tmp/slurm_master_ver.txt /tmp/local_slurm_ver.txt ") %} +check ver output: + cmd.run: + - name: echo "slurm versions are different, old ones are to be removed, new ones are installing" + +copy sh to remove old slurm: + file.managed: + - name: /tmp/remove_old_slurm.sh + - source: salt://remove_old_slurm.sh + - mode: 777 + +remove old slurm: + cmd.run: + - name: /tmp/remove_old_slurm.sh + +install slurm packages from local repo: + pkg.installed: + - sources: + - libhdf5: salt://slurm_deb/libhdf5-100_1.10.0-patch1+docs-3_amd64.deb + - libhwloc5: salt://slurm_deb/libhwloc5_1.11.5-1_amd64.deb + - libpng16: salt://slurm_deb/libpng16-16_1.6.28-1_amd64.deb + - libreadline7: salt://slurm_deb/libreadline7_7.0-0ubuntu2_amd64.deb + - librrd8: salt://slurm_deb/librrd8_1.6.0-1_amd64.deb + - slurm-wlm-basic-plugins: salt://slurm_deb/slurm-wlm-basic-plugins_16.05.9-1ubuntu1_amd64.deb + - slurmd: salt://slurm_deb/slurmd_16.05.9-1ubuntu1_amd64.deb + +{% else %} +ver no diff output: + cmd.run: + - name: echo "Slurm version is same as Slurm master, no need to install Slurm" +{% endif %} + +# TODO handle different names given distro (slurm, slurm-llnl, ...) +slurm: + group.present: + - system: True + - gid: 97 + user.present: + - fullname: SLURM daemon user account + - uid: 97 + - gid_from_name: True + - system: True + - home: /var/spool/slurm-llnl + - shell: /bin/true + service.running: + - name: slurmd + - require: + - file: /etc/slurm-llnl/slurm.conf + +# The SLURM scheduling system + +/etc/slurm-llnl/slurm.conf: + file.managed: + - name: /etc/slurm-llnl/slurm.conf + - source: salt://slurm_conf_files/slurm.conf + +/etc/slurm-llnl/cgroup.conf: + file.managed: + - name: /etc/slurm-llnl/cgroup.conf + - source: salt://slurm_conf_files/cgroup.conf + +/etc/slurm-llnl/gres.conf: + file.managed: + - name: /etc/slurm-llnl/gres.conf + +/etc/slurm-llnl/slurm.cert: + file.managed: + - name: /etc/slurm-llnl/slurm.cert + - source: salt://slurm_conf_files/slurm.cert + +/var/spool/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +/var/run/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +/var/log/slurm-llnl: + file.directory: + - group: slurm + - user: slurm + - recurse: + - user + - group + - require: + - user: slurm + +slurmd re-start: + cmd.run: + - name: systemctl restart slurmd
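
Usage sketch (not part of the patch): the new validate/reconfig states fetch their sources from salt://slurm_conf_files/ and salt://slurm_deb/, and validate_slurm.sls compares the controller's slurm_ver.txt against /tmp/local_slurm_ver.txt written by get_slurm_ver.sls. Assuming the default /srv/salt file_roots, a formula directory named "slurm", and the illustrative minion target 'roc*' (none of which this change mandates), a run could look like:

    # On the Salt master (paths assumed): stage the files the states fetch via salt://
    sudo mkdir -p /srv/salt/slurm_conf_files
    sudo cp /etc/munge/munge.key /srv/salt/slurm_conf_files/munge.key
    sudo cp /etc/slurm-llnl/slurm.cert /srv/salt/slurm_conf_files/slurm.cert
    sudo cp /etc/slurm-llnl/slurm.conf /srv/salt/slurm_conf_files/slurm.conf
    # slurm_ver.txt must hold the controller's Slurm version in the same
    # "Version: ..." format that get_slurm_ver.sls writes on the minions
    dpkg -s slurmd | grep "^Version:" | sudo tee /srv/salt/slurm_conf_files/slurm_ver.txt

    # Record each minion's installed version, then validate and push the configs
    sudo salt 'roc*' state.apply slurm.get_slurm_ver
    sudo salt 'roc*' state.apply slurm.validate_slurm test=True   # dry run first
    sudo salt 'roc*' state.apply slurm.validate_slurm

Running get_slurm_ver as its own step before validate_slurm matches the note kept in validate_slurm.sls ("do the following get slurm ver separately to guarantee it runs properly"), since the version comparison happens at Jinja render time.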