Commit b04369a (1 parent: 3a3881a)
Showing 2 changed files with 116 additions and 26 deletions.
@@ -0,0 +1,91 @@
#!/bin/bash
#SBATCH --job-name=t2i_testing
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1   # crucial - only 1 task per node; torch.distributed.run spawns the per-GPU workers
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --exclusive
#SBATCH -A cstdl
#SBATCH --partition booster
#SBATCH --output=/p/home/jusers/isozaki1/juwels/%x-%j.out
#SBATCH --time=0:10:00

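# Layout sketch (restating the settings above): srun launches exactly one task
# on the node, that task runs torch.distributed.run, and torch.distributed.run
# forks one training process per GPU. With --nodes=1 and --gres=gpu:4 the
# resulting world size is 1 node x 4 GPUs = 4 processes.
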
set -x -e

source /p/home/jusers/isozaki1/juwels/miniconda3/etc/profile.d/conda.sh
conda activate muse

echo "START TIME: $(date)"

MUSE_REPO=/p/home/jusers/isozaki1/juwels/open-muse
OUTPUT_DIR=/p/home/jusers/isozaki1/juwels/muse
LOG_PATH=$OUTPUT_DIR/main_log.txt

mkdir -p $OUTPUT_DIR
touch $LOG_PATH
pushd $MUSE_REPO

GPUS_PER_NODE=4
NNODES=$SLURM_NNODES

CMD=" \
    training/train_muse.py config=configs/imagenet_text2image_jewels.yaml \
    wandb.entity=isamu \
    experiment.name=$(basename $OUTPUT_DIR) \
    experiment.output_dir=$OUTPUT_DIR \
    training.seed=9345104 \
    experiment.num_nodes=$SLURM_NNODES
    "

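# Note: everything after the script path above is a set of key=value overrides
# applied on top of the YAML config. The config=... plus dotted-key convention
# suggests an OmegaConf-style dotlist parser in train_muse.py (an assumption
# about the parser, not confirmed by this diff).
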
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000

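# Illustration (hostnames are hypothetical): SLURM hands the job a compressed
# nodelist such as SLURM_JOB_NODELIST="jwb[0001-0002]"; "scontrol show hostnames"
# expands it to one hostname per line (jwb0001, jwb0002), and "head -n 1" picks
# the first node as the rendezvous host.
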
export LAUNCHER="python -u -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

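# For reference, with this job's values the command each node executes via the
# srun line at the bottom expands roughly to:
#
#   python -u -m torch.distributed.run --nproc_per_node 4 --nnodes 1 \
#       --rdzv_endpoint $MASTER_ADDR:6000 --rdzv_backend c10d \
#       --max_restarts 0 --tee 3 --node_rank $SLURM_PROCID \
#       --role $SLURMD_NODENAME: training/train_muse.py config=... <overrides>
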
echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# force crashing on nccl issues like hanging broadcast
# export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1

# AWS specific
# export NCCL_PROTO=simple
# export RDMAV_FORK_SAFE=1
# export FI_EFA_FORK_SAFE=1
# export FI_EFA_USE_DEVICE_RDMA=1
# export FI_PROVIDER=efa
# export FI_LOG_LEVEL=1
# export NCCL_IB_DISABLE=1
# # export NCCL_SOCKET_IFNAME=ens
# export PYTHONWARNINGS="ignore"
# export CXX=g++

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    "

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)" |