-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy path1.param-benchmark-comms.sbatch
43 lines (35 loc) · 1.46 KB
/
1.param-benchmark-comms.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/bin/bash
#
# Slurm batch script that launches the PARAM communications benchmark
# (/param/train/comms/pt/comms.py, all_reduce collective) with torchrun
# inside a container image.
#
# Usage:
#   sbatch 1.param-benchmark-comms.sbatch
#   sbatch --nodes=4 1.param-benchmark-comms.sbatch   # override the node count
#
# Optional environment overrides:
#   APPS_PATH        directory holding the container image (default: /apps)
#   IMAGE            container image handed to srun
#                    (default: ${APPS_PATH}/param-benchmark.sqsh)
#   NCCL_TESTS_PATH  defaulted for compatibility; not referenced below
#   RDZV_PORT        TCP port of the torchrun rendezvous endpoint
#                    (default: 29400)
#
#SBATCH --exclusive # the job has exclusive use of the instances it uses
#SBATCH --gres=gpu:8 # reserve 8 GPU resources / instance / node
#SBATCH --gpus-per-node=8 # 8 GPUs per node; read back below as SLURM_GPUS_PER_NODE
#SBATCH --nodes=2 # how many nodes, you can override on the CLI
#SBATCH --wait-all-nodes=1 # wait for all nodes before running the job
#SBATCH --job-name=param_benchmark # name of your job
#SBATCH --output=%x_%j.out # declare output, merge both stdout and stderr

# -e: stop on the first failing command, -u: fail on unset variables (e.g. a
# missing SLURM_* value), -x: trace commands into the job log.
set -euxo pipefail

###########################
###### User Variables #####
###########################

# default variables for Enroot
: "${APPS_PATH:=/apps}"
: "${NCCL_TESTS_PATH:=/home/ec2-user/}"
# Derive the image location from APPS_PATH so overriding APPS_PATH alone is
# enough; with the default APPS_PATH this is /apps/param-benchmark.sqsh.
: "${IMAGE:=${APPS_PATH}/param-benchmark.sqsh}"

# Port of the c10d rendezvous store. It must be a fixed port known to every
# node: port 0 lets the host pick a random free port that agents on the other
# nodes cannot discover, which only works for single-node runs.
: "${RDZV_PORT:=29400}"

## Plenty of EFA level variables
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
export FI_EFA_FORK_SAFE=1
# export NCCL_ALGO=Ring
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons
export FI_EFA_ENABLE_SHM_TRANSFER=1
# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
# https://github.com/pytorch/pytorch/issues/68893
#export NCCL_SOCKET_IFNAME=ens
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_DEBUG=INFO

# This command substitution runs in the batch script, before srun, so every
# torchrun instance receives the same rendezvous host name.
head_node=$(hostname)

# torchrun launcher options; sizes come from the Slurm allocation.
torchrun_args=(
  --nproc_per_node "${SLURM_GPUS_PER_NODE}"
  --nnodes "${SLURM_JOB_NUM_NODES}"
  --rdzv_id "${SLURM_JOB_ID}"
  --rdzv_backend c10d
  --rdzv_endpoint "${head_node}:${RDZV_PORT}"
)

# Benchmark options passed through to comms.py.
# NOTE(review): --b/--e/--f presumably are the begin size, end size and step
# factor of the message-size sweep - confirm against comms.py.
comms_args=(
  --b=8
  --e=2GB
  --f=2
  --collective=all_reduce
  --num_iters=100
)

# -l labels each output line with the task number it came from.
srun --container-image="${IMAGE}" -l torchrun \
  "${torchrun_args[@]}" \
  /param/train/comms/pt/comms.py "${comms_args[@]}"