Skip to content

Commit

Permalink
1. Specified to use version 1.10.0 of openstack.cloud ansible collect…
Browse files Browse the repository at this point in the history
…ion to improve compatibility

2. Updated to use rocky 8.7 as base OS.
3. Configured the cluster to use gcc 12
4. Improved slurm’s reliability in launching cloud instances.
5. Configured compute nodes to send syslog to the head node to facilitate debugging
6. Fixed a syntax error in slurm.conf
  • Loading branch information
shl1 committed Jan 27, 2023
1 parent c89f255 commit d76c222
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 32 deletions.
23 changes: 15 additions & 8 deletions create_compute_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
openstack.cloud.image_info:
image: "{{ compute_node_image }}"
register: compute_image_info
- fail:

- fail:
msg: "{{ compute_node_image }} already exists."
when: compute_image_info.openstack_image
when: compute_image_info.openstack_images

- name: Get size of the image
os_image_info:
image: "{{ image_name }}"
register: image_info

- name: Create boot volume for compute imaging instance
os_volume:
os_volume:
name: "{{ compute_node_image }}"
image: "{{ image_name }}"
size: "{{ (image_info.openstack_image.size/1024/1024/1024) | int }}"
size: "{{ (image_info.openstack_images[0].size/1024/1024/1024) | int }}"
bootable: true
state: present

Expand Down Expand Up @@ -97,6 +97,14 @@
permanent: true
immediate: true

# Append a forwarding rule to /etc/rsyslog.conf so the compute node sends
# every syslog message (*.*) to the head node's private IPv4 address over
# UDP port 514.
# Exactly one forwarding rule is written: emitting both the legacy
# `*.* @host:514` line and an omfwd action() for the same target would
# forward each message twice.
- name: Configure compute node to send syslog to head node
  blockinfile:
    path: "/etc/rsyslog.conf"
    insertbefore: EOF
    block: |
      *.* action(type="omfwd" Target="{{ hostvars['localhost'].headnode.openstack_servers[0].private_v4 }}" Port="514" Protocol="udp")
- name: Add home directory mount to /etc/fstab
ansible.posix.mount:
path: /home
Expand All @@ -111,7 +119,6 @@
owner: root
group: root
mode: 0755
recurse: yes

- name: Add /opt/ohpc/pub to /etc/fstab
ansible.posix.mount:
Expand Down Expand Up @@ -192,7 +199,7 @@
gather_facts: false
vars_files:
- ./vars/main.yml

tasks:
- name: Delete compute imaging instance
openstack.cloud.server:
Expand All @@ -212,7 +219,7 @@
name: "{{ compute_node_image }}"
volume: "{{ volumes_info.volumes[0].id }}"
disk_format: raw
timeout: 600
timeout: 1200
state: present

- name: Delete compute image volume
Expand Down
11 changes: 11 additions & 0 deletions provision_headnode.yml
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,14 @@
src: /etc/munge/munge.key
dest: ./files/munge.key
flat: yes

# Ensures /etc/rsyslog.d/ohpc.conf (created if missing) ends with a managed
# block that loads the imudp module and opens a UDP input on port 514, then
# notifies the "restart rsyslog" handler when the file changes.
# NOTE(review): the "restart rsyslog" handler is not visible in this hunk —
# confirm it is defined for this play.
- name: Configure rsyslog to accept syslog from compute nodes
blockinfile:
path: /etc/rsyslog.d/ohpc.conf
insertbefore: EOF
create: true
state: present
block: |
module(load="imudp")
input(type="imudp" port="514")
notify: restart rsyslog
3 changes: 2 additions & 1 deletion requirements.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
---
collections:
- openstack.cloud
- name: openstack.cloud
version: 1.10.0
- ansible.posix
- community.general

Expand Down
74 changes: 74 additions & 0 deletions tasks/install_oneapi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---
# Install Intel oneAPI and integrate it with OpenHPC.
#
# Order of operations:
#   1. install the oneAPI repo release package and import the repo GPG key;
#   2. install the HPC kit plus the OpenHPC compiler / MPI integration
#      packages;
#   3. build the MKL wrapper libraries (blas95, FFTW2 and FFTW3 interfaces)
#      by running `make` in the matching $MKLROOT/interfaces/<name> directory;
#   4. install the OpenHPC packages built for the Intel toolchain.
#
# Each build task sources the oneAPI environment scripts first (they provide
# $MKLROOT) and carries a `creates:` guard naming the library it produces, so
# the build is skipped when that file already exists.

- name: Add oneAPI repo
  dnf:
    name: intel-oneapi-toolkit-release-ohpc
    state: present

- name: Import GPG key for oneAPI repo
  rpm_key:
    state: present
    key: https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB

- name: Install HPC kit
  dnf:
    name: intel-hpckit
    state: present

- name: Integrate Intel compilers and Intel MPI with OpenHPC
  dnf:
    name:
      - intel-compilers-devel-ohpc
      - intel-mpi-devel-ohpc
    state: present

# Builds the BLAS95 interface library (interfaces/blas95).
- name: Install blas95
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      cd $MKLROOT/interfaces/blas95;
      make CFLAGS="-fPIC" libintel64 INSTALL_DIR=$MKLROOT
    creates: "/opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_blas95_lp64.a"

- name: Install fftw2xc
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw2xc;
      make CFLAGS="-fPIC" libintel64
    creates: "/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw2xc_double_intel.a"

- name: Install fftw2xf
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw2xf;
      make CFLAGS="-fPIC" libintel64
    creates: "/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw2xf_double_intel.a"

# The cluster (cdft) wrappers also need the Intel MPI environment.
- name: Install fftw2x_cdft
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      source /opt/intel/oneapi/mpi/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw2x_cdft;
      make CFLAGS="-fPIC" libintel64 mpi=intelmpi compiler=intel
    creates: '/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw2x_cdft_DOUBLE_lp64.a'

- name: Install fftw3xc
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw3xc;
      make CFLAGS="-fPIC" libintel64 compiler=intel MKLROOT=$MKLROOT INSTALL_DIR=$MKLROOT/lib/intel64
    creates: '/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw3xc_intel.a'

- name: Install fftw3xf
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw3xf;
      make CFLAGS="-fPIC" libintel64 compiler=intel MKLROOT=$MKLROOT INSTALL_DIR=$MKLROOT/lib/intel64
    creates: '/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw3xf_intel.a'

# The guard must name the library built by THIS task. Guarding on the
# fftw2x_cdft library (already produced by the "Install fftw2x_cdft" task
# above) would cause this build to be skipped on every run.
# NOTE(review): libfftw3x_cdft_lp64.a is the expected output name of the
# fftw3x_cdft makefile for the default lp64 interface — confirm on a built
# system.
- name: Install fftw3x_cdft
  shell:
    cmd: >-
      source /opt/intel/oneapi/compiler/latest/env/vars.sh;
      source /opt/intel/oneapi/mkl/latest/env/vars.sh;
      source /opt/intel/oneapi/mpi/latest/env/vars.sh;
      cd $MKLROOT/interfaces/fftw3x_cdft;
      make CFLAGS="-fPIC" libintel64 mpi=intelmpi compiler=intel
    creates: '/opt/intel/oneapi/mkl/latest/lib/intel64/libfftw3x_cdft_lp64.a'

- name: Install the rest of OpenHPC packages for Intel toolchain
  dnf:
    name:
      - ohpc-intel-io-libs
      - ohpc-intel-impi-io-libs
      - ohpc-intel-perf-tools
      - ohpc-intel-python3-libs
      - ohpc-intel-impi-parallel-libs
      - ohpc-intel-openmpi4-parallel-libs
      - python3-mpi4py-intel-impi-ohpc
      - pnetcdf-intel-impi-ohpc
      - gsl-intel-ohpc
      - boost-intel-impi-ohpc
      - hypre-intel-impi-ohpc
      - scalapack-intel-impi-ohpc
    state: present
2 changes: 1 addition & 1 deletion templates/slurm.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld/
StateSaveLocation=/var/spool/slurmctld/
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
Expand Down
21 changes: 19 additions & 2 deletions templates/slurm_resume.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,29 @@ echo "Node resume invoked: $0 $*" >> $log_loc
for host in $(scontrol show hostname $1)
do

(echo "creating $host" >> $log_loc;
(echo "resuming $host" >> $log_loc;
if [[ $(/usr/local/bin/openstack server show -c status -f value $host | wc -l) -gt 0 ]]
then
echo "clean up $host" >> $log_loc;
/usr/local/bin/openstack server delete $host;
sleep 15;
fi;
volume_status=$(/usr/local/bin/openstack volume show -c status -f value $host);
while [[ $volume_status == 'available' ]] || [[ $volume_status == 'error' ]]; do
delete_result=$(/usr/local/bin/openstack volume delete $host);
echo "Clean up old volume $host: ${delete_result}" >> $log_loc;
volume_status=$(/usr/local/bin/openstack volume show -c status -f value $host);
sleep 5;
done;
/usr/local/bin/openstack volume create \
--image {{ compute_node_image }} \
--size {{ compute_node_disk_size_gb }} $host >> $log_loc;
until [[ $(/usr/local/bin/openstack volume show -c status -f value $host) == 'available' ]]; do
echo "Wait for volume $host to be ready" >> $log_loc;
sleep 1;
done;
/usr/local/bin/openstack server create $host \
--flavor {{ compute_node_flavor }} \
--flavor {{compute_node_flavor}} \
--volume $host \
--key-name {{ keypair_name }} \
--security-group {{ cluster_security_group }} \
Expand Down
40 changes: 20 additions & 20 deletions vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# Cluster general info
#
cluster_name: "cluster1"
image_name: "rocky-8.5"
image_init_user: "cloud-user"
image_name: "rocky-8.7"
image_init_user: "rocky"
yum_repo_prefix:
Rocky: "Rocky"
CentOS: "CentOS-Linux"
Expand All @@ -19,28 +19,28 @@ head_node_flavor: c1.m8
head_node_disk_size_gb: 50
head_node_user_packages:
- ohpc-autotools
- ohpc-gnu9-io-libs
- ohpc-gnu9-openmpi4-io-libs
- ohpc-gnu9-openmpi4-parallel-libs
- ohpc-gnu9-parallel-libs
- ohpc-gnu9-python3-libs
- gnu9-compilers-ohpc
- openmpi4-gnu9-ohpc
- lmod-defaults-gnu9-openmpi4-ohpc
- ohpc-gnu12-io-libs
- ohpc-gnu12-openmpi4-io-libs
- ohpc-gnu12-openmpi4-parallel-libs
- ohpc-gnu12-parallel-libs
- ohpc-gnu12-python3-libs
- gnu12-compilers-ohpc
- openmpi4-gnu12-ohpc
- lmod-defaults-gnu12-openmpi4-ohpc
- automake-ohpc
- autoconf-ohpc
- cmake-ohpc
- libtool-ohpc
- python3-scipy-gnu9-openmpi4-ohpc
- python3-numpy-gnu9-ohpc
- python3-mpi4py-gnu9-openmpi4-ohpc
- pnetcdf-gnu9-openmpi4-ohpc
- gsl-gnu9-ohpc
- openblas-gnu9-ohpc
- boost-gnu9-openmpi4-ohpc
- fftw-gnu9-openmpi4-ohpc
- hypre-gnu9-openmpi4-ohpc
- scalapack-gnu9-openmpi4-ohpc
- python3-scipy-gnu12-openmpi4-ohpc
- python3-numpy-gnu12-ohpc
- python3-mpi4py-gnu12-openmpi4-ohpc
- pnetcdf-gnu12-openmpi4-ohpc
- gsl-gnu12-ohpc
- openblas-gnu12-ohpc
- boost-gnu12-openmpi4-ohpc
- fftw-gnu12-openmpi4-ohpc
- hypre-gnu12-openmpi4-ohpc
- scalapack-gnu12-openmpi4-ohpc
mysql_user: slurm
mysql_password: slurmdb
# Set install_intel_oneapi to true to install Intel OneAPI (minimum head_node_disk_size_gb 60 )
Expand Down

0 comments on commit d76c222

Please sign in to comment.