Remove AWS Pytorch channel in examples #453

Merged · 7 commits · Oct 11, 2024

Changes from all commits
10 changes: 1 addition & 9 deletions 3.test_cases/10.FSDP/0.create_conda_env.sh
@@ -14,15 +14,7 @@ conda create -y -p ./pt_fsdp python=3.11

source activate ./pt_fsdp/

# Set true to install AWS Pytorch. see https://aws-pytorch-doc.com/
use_aws_pytorch=true

if $use_aws_pytorch; then
conda install -y pytorch=2.3.0 pytorch-cuda=12.1 aws-ofi-nccl torchvision torchaudio transformers datasets fsspec=2023.9.2 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
else
conda install -y pytorch=2.4.1 torchvision torchaudio transformers datasets fsspec=2023.9.2 pytorch-cuda=12.1 -c pytorch -c nvidia
conda install -y aws-ofi-nccl=1.9.1 -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c conda-forge
fi
conda install -y pytorch=2.4.1 torchvision torchaudio transformers datasets fsspec=2023.9.2 pytorch-cuda=12.1 -c pytorch -c nvidia

# Create checkpoint dir
mkdir checkpoints
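
Not part of the diff — a quick sanity check one might run after the environment above is built, using the `./pt_fsdp/` path from the script:

```
# Verify the pinned PyTorch, CUDA and NCCL versions resolved as expected
source activate ./pt_fsdp/
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.nccl.version())"
```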
@@ -20,12 +20,8 @@ GPUS_PER_NODE=8 # 4 for G5.12x, 8 for P4/P5
###########################

## Plenty of EFA level variables
## Comment out for non-efa instances (G4d, P3)
## For G5.12x, Comment out RDMA and Fork safe
## For G4dn and other G5, comment out all
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_LOG_LEVEL=warn
export FI_PROVIDER=efa
# export FI_EFA_USE_HUGE_PAGE=0 # Set to 0 when you see os.fork() causes OSError: Cannot allocate memory. Disabling huge page causes minor performance hit.
export NCCL_DEBUG=INFO
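
Not part of the diff — to confirm the EFA provider is actually present before relying on these variables, a common check (assuming the libfabric utilities are installed on the instance) is:

```
# Query the EFA libfabric provider; an error here means EFA is not available on this instance
fi_info -p efa
```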
22 changes: 5 additions & 17 deletions 3.test_cases/10.FSDP/README.md
@@ -36,24 +36,12 @@ If you'd like to instead use your own dataset, you can do so by [formatting it a

## 3. Launch Training

- The script to launch a Llama 2 Slurm batch training job can be found in `1.distributed_training.sbatch`.
- The script to launch a Mixtral training can be found in `2.distributed_training_mixtral.sbatch`.
- The script to launch a Llama 2 Slurm batch training job can be found in `1.distributed-training-llama2.sbatch`.
- The script to launch a Mixtral training can be found in `2.distributed-training-mixtral.sbatch`.
- The script to launch a Mistral Mathstral training job can be found in `3.distributed-training-mistral-mathstral.sbatch`.
- You can adjust the number of training nodes by modifying `#SBATCH --nodes=4` to match the size of your cluster.

If you are using a non-RDMA enable instance, such as g5.12xlarge, comment out lines 21-22. These instances have EFA between nodes, but do not have the GPU direct RDMA access of P4d and P5 instances.

```
## Plenty of EFA level variables
## Comment out for non-efa instances (G5, G4d, P3)
# export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
# export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa
export NCCL_DEBUG=INFO
```

If you are using non-EFA enabled instances, such as G4dn, or single GPU g5 nodes, comment out all EFA environment variables on lines 21-25.
If you are using non-EFA-enabled instances, such as G4dn, or single-GPU G5 nodes, comment out all EFA environment variables on lines 24-25.

Also, under `User Variables` make sure to adjust `GPUS_PER_NODE` to match the number of GPUs on your instance type (8 for P4d(e)/P5, 4 for G5.12xlarge, 1 for G5.xlarge).
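
For illustration only (not part of the diff), the two settings called out above look roughly like this inside the sbatch scripts; exact line numbers vary per script:

```
#SBATCH --nodes=4   # number of training nodes, set to match your cluster
GPUS_PER_NODE=8     # 8 for P4d(e)/P5, 4 for G5.12xlarge, 1 for G5.xlarge
```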

@@ -127,9 +115,9 @@ declare -a TRAINING_ARGS=(
To launch your training for Llama 2, run

```
sbatch 1.distributed_training.sbatch
sbatch 1.distributed-training-llama2.sbatch
```
Similarly for Mixtral 8x7B and Mathstral, launch run `sbatch` with the `2.distributed_training_mixtral.sbatch` and the `3.distributed-training-mistral-mathstral.sbatch` files respectively.
Similarly, for Mixtral 8x7B and Mathstral, run `sbatch` with the `2.distributed-training-mixtral.sbatch` and `3.distributed-training-mistral-mathstral.sbatch` files respectively.
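
For convenience (not part of the diff), the corresponding launch commands are:

```
sbatch 2.distributed-training-mixtral.sbatch
sbatch 3.distributed-training-mistral-mathstral.sbatch
```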

You'll find a new file in the FSDP directory of the form `slurm-[job-number].out`. This will be continuously updated with your training logs. Don't be worried if you see a long stream of NCCL logs (we prefer to use `NCCL_DEBUG=INFO` for verbose logging). After about a minute, you should see your model training, with output similar to the below for Llama 2:
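
Not part of the diff — a simple way to follow that log as it is written (the job number is whatever Slurm assigned):

```
tail -f slurm-[job-number].out
```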

4 changes: 1 addition & 3 deletions 3.test_cases/16.pytorch-cpu-ddp/slurm/0.create-conda-env.sh
@@ -10,10 +10,8 @@ chmod +x Miniconda3-latest-Linux-x86_64.sh

source ./miniconda3/bin/activate

conda create -y -p ./pt_cpu python=3.10 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
conda create -y -p ./pt_cpu python=3.10 pytorch=2.0.1 -c pytorch -c nvidia -c conda-forge

source activate ./pt_cpu/

conda install -y pytorch=2.0.1 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge

rm Miniconda3-latest-Linux-x86_64.sh*
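
Not part of the diff — a quick check that the CPU environment resolved the intended build:

```
source activate ./pt_cpu/
python -c "import torch; import torch.distributed as dist; print(torch.__version__, dist.is_available())"
```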
10 changes: 0 additions & 10 deletions 3.test_cases/17.SM-modelparallelv2/README.md
@@ -49,16 +49,6 @@ bash setup_conda_env.sh
These scripts need to be put in a shared file system that can be accessed by all nodes, such as [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html).
We also recommend setting all paths for input data and checkpoints as shared directories using FSx for Lustre.

### cuDNN Download for cuda11.8 and cuda12.1
We recommend that you install cuDNN for your desired cuda version using from the [NVIDIA Developer page](https://developer.nvidia.com/cudnn). Click on the link and:
1. Make a developer account.
2. Click on "Download cuDNN Library".
3. Agree to the terms.
4. Download the Local Installer for Linux x86_64 (Tar) for cuda11 or cuda12 (we will use version 8.9.5 in the example going forward).
4. Move the tar file from your local machine to your cluster root directory.



### User Guide
1. **Launching a job with synthetic data on 8 nodes**

Binary file not shown.
Binary file not shown.
42 changes: 17 additions & 25 deletions 3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh
@@ -16,32 +16,22 @@ source ./miniconda3/bin/activate

export ENV_PATH=./miniconda3/envs/smpv2

conda create -p ${ENV_PATH} python=3.10
conda create -p ${ENV_PATH} -y python=3.10 -c conda-forge

conda activate ${ENV_PATH}


# Install OFI nccl
conda install "aws-ofi-nccl==1.7.4" packaging --override-channels \
-c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
-c pytorch -c numba/label/dev \
-c nvidia \
-c conda-forge \

conda install -c conda-forge mkl=2023.1.0
conda install "requests==2.28.2"
conda install "filelock==3.9.0"
conda install "sympy==1.12"
conda install -p ${ENV_PATH} -y \
./bin/aws-ofi-nccl-1.7.4-aws_0.tar.bz2 \
./bin/hwloc-2.9.2-h2bc3f7f_0.tar.bz2

# Install SMP V2 pytorch. We will install SMP with pytorch 2.2
conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
conda install -p ${ENV_PATH} -y mkl=2023.1.0 "requests==2.28.2" "filelock==3.9.0" "sympy==1.12" \
pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
-c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \
-c pytorch -c numba/label/dev \
-c pytorch-nightly -c nvidia -c conda-forge


# Install dependencies of the script as below

python -m pip install --no-cache-dir -U \
"transformers==4.37.1" \
"accelerate==0.28.0" \
@@ -60,7 +50,7 @@ pip install megatron-core==0.5.0

pip uninstall -y ninja && pip install ninja

MAX_JOBS=64 pip install flash-attn==2.3.3 --no-build-isolation
MAX_JOBS=$(nproc) pip install flash-attn==2.3.3 --no-build-isolation

# Install SMDDP

@@ -72,18 +62,20 @@ SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \

if [ $SMP_CUDA_VER == "11.8" ]; then
# cuDNN installation for TransformerEngine installation for cuda11.8
tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
&& tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
&& sudo rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
&& sudo cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
&& sudo cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/
else
# cuDNN installation for TransformerEngine installation for cuda12.1
tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
&& tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
&& sudo rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
&& sudo cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
&& sudo cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
fi
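
Not part of the diff — one way to confirm the copied cuDNN headers and libraries are picked up, assuming the same $SMP_CUDA_VER as above:

```
# The archive ships include/cudnn_version.h; PyTorch reports the cuDNN build it loads
ls /usr/local/cuda-$SMP_CUDA_VER/include/cudnn_version.h
python -c "import torch; print(torch.backends.cudnn.version())"
```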
5 changes: 0 additions & 5 deletions 3.test_cases/20.FSDP-Mamba/0.create_conda.sh
@@ -18,10 +18,5 @@ source ./conda/bin/activate
conda create -n mambapretrain python=3.10
conda activate mambapretrain

# Install pytorch and other dependencies
#conda install -y pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forgea
#pip install causal-conv1d>=1.2.0
#pip install mamba-ssm

conda install -y pytorch==2.3.0 pytorch-cuda=11.8 -c pytorch -c nvidia
pip install -r requirements.txt
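
Not part of the diff — a minimal import check after the install, assuming `requirements.txt` still provides `mamba-ssm` and `causal-conv1d` as the removed comments suggest:

```
python -c "import torch; print(torch.__version__, torch.version.cuda)"
python -c "import mamba_ssm, causal_conv1d"
```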