From c9e2b50cfd9e4a060cad753373b1759e47c19e34 Mon Sep 17 00:00:00 2001
From: Manaure Francisquez
Date: Thu, 7 Mar 2024 15:26:34 -0800
Subject: [PATCH] Update perlmutter machine files.

The problem was that we were not using cray-mpich and were not disabling
CUDA-aware MPI. Now sims appear to run reliably.
---
 machines/configure.perlmutter.gpu.sh | 17 ++++++++--
 machines/jobScript.perlmutter-gpu    | 51 ++++++++++++++++++++++++++++
 machines/mkdeps.perlmutter.gpu.sh    | 17 ++++++++--
 3 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 machines/jobScript.perlmutter-gpu

diff --git a/machines/configure.perlmutter.gpu.sh b/machines/configure.perlmutter.gpu.sh
index c9e46d6d8..5645c0372 100755
--- a/machines/configure.perlmutter.gpu.sh
+++ b/machines/configure.perlmutter.gpu.sh
@@ -1,8 +1,19 @@
+#.MF 2024/03/07: At the time we got this to work I had the following modules loaded
+#.  1) craype-x86-milan                         7) cpe/23.12               13) craype-accel-nvidia80
+#.  2) libfabric/1.15.2.0                       8) gpu/1.0                 14) cray-mpich/8.1.28 (mpi)
+#.  3) craype-network-ofi                       9) craype/2.7.30 (c)       15) cudatoolkit/12.0 (g)
+#.  4) xpmem/2.6.2-2.5_2.38__gd067c3f.shasta   10) cray-dsmml/0.2.2        16) nccl/2.18.3-cu12
+#.  5) gcc-native/12.3                         11) cray-libsci/23.12.5 (math)
+#.  6) perftools-base/23.12.0                  12) PrgEnv-gnu/8.5.0 (cpe)
+#.Most of these are loaded by default, so we just load some extra/key ones here.
+module load PrgEnv-gnu/8.5.0
+module load craype-accel-nvidia80
+module load cray-mpich/8.1.28
 module load cudatoolkit/12.0
-module load openmpi/5.0.0rc12
 module load nccl/2.18.3-cu12
-module unload darshan
+
 : "${PREFIX:=$HOME/gkylsoft}"
-./configure CC=nvcc ARCH_FLAGS="-march=native" CUDA_ARCH=80 --prefix=$PREFIX --lapack-inc=$PREFIX/OpenBLAS/include --lapack-lib=$PREFIX/OpenBLAS/lib/libopenblas.a --superlu-inc=$PREFIX/superlu/include --superlu-lib=$PREFIX/superlu/lib/libsuperlu.a --cudamath-libdir=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/lib64 --use-mpi=yes --mpi-inc=$OPENMPI_ROOT/include --mpi-lib=$OPENMPI_ROOT/lib --use-nccl=yes --nccl-inc=$NCCL_DIR/include --nccl-lib=$NCCL_DIR/lib;
+
+./configure CC=nvcc ARCH_FLAGS="-march=native" CUDA_ARCH=80 --prefix=$PREFIX --lapack-inc=$PREFIX/OpenBLAS/include --lapack-lib=$PREFIX/OpenBLAS/lib/libopenblas.a --superlu-inc=$PREFIX/superlu/include --superlu-lib=$PREFIX/superlu/lib/libsuperlu.a --cudamath-libdir=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/lib64 --use-mpi=yes --mpi-inc=$CRAY_MPICH_DIR/include --mpi-lib=$CRAY_MPICH_DIR/lib --use-nccl=yes --nccl-inc=$NCCL_DIR/include --nccl-lib=$NCCL_DIR/lib;
diff --git a/machines/jobScript.perlmutter-gpu b/machines/jobScript.perlmutter-gpu
new file mode 100644
index 000000000..39aaa2d9e
--- /dev/null
+++ b/machines/jobScript.perlmutter-gpu
@@ -0,0 +1,51 @@
+#!/bin/bash -l
+
+#.Declare a name for this job, preferably with 16 or fewer characters.
+#SBATCH -J
+#SBATCH -A
+
+#.Request the queue (enter one of the available queue names; if omitted the default is used).
+#.This job uses the regular queue.
+#SBATCH -q regular
+
+#.Number of nodes to request (Perlmutter GPU nodes have 64 cores and 4 GPUs each).
+#SBATCH -N 2
+#SBATCH --ntasks 8
+
+#.Specify GPU needs:
+#SBATCH --constraint gpu
+#SBATCH --gpus 8
+
+#.Request wall time.
+#SBATCH -t 00:30:00
+
+#.Mail is sent to you when the job ends, fails, or is requeued.
+#SBATCH --mail-user=
+#SBATCH --mail-type=END,FAIL,REQUEUE
+
+#.Load modules (these must match those in the machines/configure script).
+module load PrgEnv-gnu/8.5.0
+module load craype-accel-nvidia80
+module load cray-mpich/8.1.28
+module load cudatoolkit/12.0
+module load nccl/2.18.3-cu12
+
+#.On Perlmutter some jobs get warnings about DVS_MAXNODES (used in file striping).
+#.We set it to 24 for now, but really this depends on the amount/size of I/O being performed.
+#.See the online NERSC docs and the intro_mpi man page.
+export DVS_MAXNODES=24_
+export MPICH_MPIIO_DVS_MAXNODES=24
+
+#.Run the rt_gk_sheath_2x2v_p1 executable using 1 GPU along x (-c 1) and 8
+#.GPUs along the field line (-d 8). See './rt_gk_sheath_2x2v_p1 -h' for
+#.more details/options on decomposition. It assumes the executable is
+#.in the present directory; if it isn't, change `./` to point to the
+#.directory containing the executable.
+
+echo "srun -u -n 8 --gpus 8 ./rt_gk_sheath_2x2v_p1 -g -M -c 1 -d 8"
+srun -u -n 8 --gpus 8 ./rt_gk_sheath_2x2v_p1 -g -M -c 1 -d 8
+
+
+
+
+
diff --git a/machines/mkdeps.perlmutter.gpu.sh b/machines/mkdeps.perlmutter.gpu.sh
index 884f42739..7a34325ce 100755
--- a/machines/mkdeps.perlmutter.gpu.sh
+++ b/machines/mkdeps.perlmutter.gpu.sh
@@ -1,7 +1,18 @@
+#.MF 2024/03/07: At the time we got this to work I had the following modules loaded
+#.  1) craype-x86-milan                         7) cpe/23.12               13) craype-accel-nvidia80
+#.  2) libfabric/1.15.2.0                       8) gpu/1.0                 14) cray-mpich/8.1.28 (mpi)
+#.  3) craype-network-ofi                       9) craype/2.7.30 (c)       15) cudatoolkit/12.0 (g)
+#.  4) xpmem/2.6.2-2.5_2.38__gd067c3f.shasta   10) cray-dsmml/0.2.2        16) nccl/2.18.3-cu12
+#.  5) gcc-native/12.3                         11) cray-libsci/23.12.5 (math)
+#.  6) perftools-base/23.12.0                  12) PrgEnv-gnu/8.5.0 (cpe)
+#.Most of these are loaded by default, so we just load some extra/key ones here.
+module load PrgEnv-gnu/8.5.0
+module load craype-accel-nvidia80
+module load cray-mpich/8.1.28
 module load cudatoolkit/12.0
-module load openmpi/5.0.0rc12
 module load nccl/2.18.3-cu12
-module unload darshan
-cd install-deps
+
 : "${PREFIX:=$HOME/gkylsoft}"
+
+cd install-deps
 ./mkdeps.sh --build-openblas=yes --build-superlu=yes --prefix=$PREFIX --build-openmpi=no MPICC=mpicc MPICXX=mpicxx
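
Usage sketch: one plausible way to exercise these machine files on Perlmutter,
assuming the usual flow of building dependencies, configuring, building, and
then submitting the job script from the repository root. The make step and its
flags are assumptions (they are not set by this patch), and the sbatch
placeholders in jobScript.perlmutter-gpu (-J, -A, --mail-user) must be filled
in first.

    ./machines/mkdeps.perlmutter.gpu.sh       # build OpenBLAS/SuperLU under $HOME/gkylsoft
    ./machines/configure.perlmutter.gpu.sh    # configure against cray-mpich, cudatoolkit and NCCL
    make -j 16                                # assumed build step
    sbatch machines/jobScript.perlmutter-gpu  # submit the 2-node, 8-GPU example job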