From 838960a5e2aadd9e2dfe28ace3de37a2a8d93efc Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Mon, 9 Dec 2024 23:07:52 -0800
Subject: [PATCH 1/6] Patch PT 2.4 Training SM DLC

---
 dlc_developer_config.toml                     | 112 ++----------------
 ...ockerfile.sagemaker.cpu.core_packages.json |   4 +-
 ...ockerfile.sagemaker.gpu.core_packages.json |   4 +-
 3 files changed, 15 insertions(+), 105 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index e54e9a26290b..212b285c205f 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -1,167 +1,77 @@
 [dev]
-# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
 partner_developer = ""
-# Please only set it to true if you are preparing an EI related PR
-# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
 ei_mode = false
-# Please only set it to true if you are preparing a NEURON related PR
-# Do remember to revert it back to false before merging any PR (including NEURON dedicated PR)
 neuron_mode = false
-# Please only set it to true if you are preparing a NEURONX related PR
-# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR)
 neuronx_mode = false
-# Please only set it to true if you are preparing a GRAVITON related PR
-# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR)
 graviton_mode = false
-# Please only set it to true if you are preparing a ARM64 related PR
-# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
 arm64_mode = false
-# Please only set it to True if you are preparing a HABANA related PR
-# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR)
 habana_mode = false
-# Please only set it to True if you are preparing a HUGGINGFACE TRCOMP related PR
-# Do remember to revert it back to False before merging any PR (including HUGGINGFACE TRCOMP dedicated PR)
-# This mode is used to build TF 2.6 and PT1.11 DLC
 huggingface_trcomp_mode = false
-# Please only set it to True if you are preparing a TRCOMP related PR
-# Do remember to revert it back to False before merging any PR (including TRCOMP dedicated PR)
-# This mode is used to build PT1.12 and above DLC
 trcomp_mode = false
-# Set deep_canary_mode to true to simulate Deep Canary Test conditions on PR for all frameworks in the
-# build_frameworks list below. This will cause all image builds and non-deep-canary tests on the PR to be skipped,
-# regardless of whether they are enabled or disabled below.
-# Set graviton_mode/arm64_mode to true to run Deep Canaries on Graviton/ARM64 images.
-# Do remember to revert it back to false before merging any PR.
 deep_canary_mode = false
 
 [build]
-# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
-build_frameworks = []
-
-# By default we build both training and inference containers. Set true/false values to determine which to build.
+build_frameworks = [ "pytorch",]
 build_training = true
-build_inference = true
-
-# Set do_build to "false" to skip builds and test the latest image built by this PR
-# Note: at least one build is required to set do_build to "false"
+build_inference = false
 do_build = true
 
 [notify]
-### Notify on test failures
-### Off by default
 notify_test_failures = false
-  # Valid values: medium or high
-  notification_severity = "medium"
+notification_severity = "medium"
 
 [test]
-### On by default
 sanity_tests = true
 security_tests = true
-  safety_check_test = false
-  ecr_scan_allowlist_feature = false
+safety_check_test = false
+ecr_scan_allowlist_feature = false
 ecs_tests = true
 eks_tests = true
 ec2_tests = true
-# Set it to true if you are preparing a Benchmark related PR
-ec2_benchmark_tests = false
-
-### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
-### default. If false, these types of tests will be skipped while other tests will run as usual.
-### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
-### Off by default (set to false)
-ec2_tests_on_heavy_instances = false
-
-### SM specific tests
-### On by default
+ec2_benchmark_tests = true
+ec2_tests_on_heavy_instances = true
 sagemaker_local_tests = true
-
-# run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
-# run efa sagemaker tests
-sagemaker_efa_tests = false
-# run release_candidate_integration tests
-sagemaker_rc_tests = false
-# run sagemaker benchmark tests
-sagemaker_benchmark_tests = false
-
-# SM remote EFA test instance type
+sagemaker_efa_tests = true
+sagemaker_rc_tests = true
+sagemaker_benchmark_tests = true
 sagemaker_remote_efa_instance_type = ""
-
-# Run CI tests for nightly images
-# false by default
 nightly_pr_test_mode = false
-
 use_scheduler = false
 
 [buildspec_override]
-# Assign the path to the required buildspec file from the deep-learning-containers folder
-# For example:
-# dlc-pr-tensorflow-2-habana-training = "habana/tensorflow/training/buildspec-2-10.yml"
-# dlc-pr-pytorch-inference = "pytorch/inference/buildspec-1-12.yml"
-# Setting the buildspec file path to "" allows the image builder to choose the default buildspec file.
-
-### TRAINING PR JOBS ###
-
-# Standard Framework Training
 dlc-pr-mxnet-training = ""
-dlc-pr-pytorch-training = ""
+dlc-pr-pytorch-training = "pytorch/training/buildspec-2-4-sm.yml"
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
-
-# HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""
 dlc-pr-huggingface-pytorch-training = ""
-
-# Training Compiler
 dlc-pr-huggingface-pytorch-trcomp-training = ""
 dlc-pr-huggingface-tensorflow-2-trcomp-training = ""
 dlc-pr-pytorch-trcomp-training = ""
-
-# Neuron Training
 dlc-pr-mxnet-neuron-training = ""
 dlc-pr-pytorch-neuron-training = ""
 dlc-pr-tensorflow-2-neuron-training = ""
-
-# Stability AI Training
 dlc-pr-stabilityai-pytorch-training = ""
-
-# Habana Training
 dlc-pr-pytorch-habana-training = ""
 dlc-pr-tensorflow-2-habana-training = ""
-
-### INFERENCE PR JOBS ###
-
-# Standard Framework Inference
 dlc-pr-mxnet-inference = ""
 dlc-pr-pytorch-inference = ""
 dlc-pr-tensorflow-2-inference = ""
 dlc-pr-autogluon-inference = ""
-
-# Neuron Inference
 dlc-pr-mxnet-neuron-inference = ""
 dlc-pr-pytorch-neuron-inference = ""
 dlc-pr-tensorflow-1-neuron-inference = ""
 dlc-pr-tensorflow-2-neuron-inference = ""
-
-# HuggingFace Inference
 dlc-pr-huggingface-tensorflow-inference = ""
 dlc-pr-huggingface-pytorch-inference = ""
 dlc-pr-huggingface-pytorch-neuron-inference = ""
-
-# Stability AI Inference
 dlc-pr-stabilityai-pytorch-inference = ""
-
-# Graviton Inference
 dlc-pr-mxnet-graviton-inference = ""
 dlc-pr-pytorch-graviton-inference = ""
 dlc-pr-tensorflow-2-graviton-inference = ""
-
-# ARM64 Inference
 dlc-pr-pytorch-arm64-inference = ""
 dlc-pr-tensorflow-2-arm64-inference = ""
-
-# EIA Inference
 dlc-pr-mxnet-eia-inference = ""
 dlc-pr-pytorch-eia-inference = ""
 dlc-pr-tensorflow-2-eia-inference = ""
diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json
index 550b7143779a..d01b3ecb877c 100644
--- a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json
+++ b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json
@@ -1,6 +1,6 @@
 {
   "accelerate": {
-    "version_specifier": "==1.1.1",
+    "version_specifier": "==1.2.0",
     "skip": "True"
   },
   "fastai": {
@@ -8,7 +8,7 @@
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.2.7",
+    "version_specifier": "==1.3.0",
     "skip": "True"
   },
   "torchaudio": {
diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json
index 85af35715996..5932b03226bf 100644
--- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json
+++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json
@@ -1,6 +1,6 @@
 {
   "accelerate": {
-    "version_specifier": "==1.1.1",
+    "version_specifier": "==1.2.0",
     "skip": "True"
   },
   "fastai": {
@@ -12,7 +12,7 @@
     "skip": "True"
   },
   "s3torchconnector": {
-    "version_specifier": "==1.2.7",
+    "version_specifier": "==1.3.0",
     "skip": "True"
   },
   "torchaudio": {

From dc04db017ae714c54a4c6c3f681bfc6bf0ab24af Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Tue, 10 Dec 2024 11:04:09 -0800
Subject: [PATCH 2/6] pin blis

---
 .../training/docker/2.4/py3/Dockerfile.cpu    |  88 ++++-----
 .../docker/2.4/py3/cu124/Dockerfile.gpu       | 174 +++++++++---------
 2 files changed, 131 insertions(+), 131 deletions(-)

diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
index dfc34bb46e7d..4de4feae40cc 100644
--- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
@@ -200,49 +200,49 @@ RUN rm -rf /root/.cache | true
 #                  |_|
 ########################################################
 
-FROM common AS ec2
-
-ARG PYTHON
-ARG TORCH_URL
-ARG TORCHVISION_URL
-ARG TORCHAUDIO_URL
-ARG TORCHTEXT_URL
-
-WORKDIR /
-
-# Install PyTorch
-RUN pip install --no-cache-dir -U \
-    ${TORCH_URL} \
-    ${TORCHVISION_URL} \
-    ${TORCHAUDIO_URL} \
-    ${TORCHTEXT_URL} \
-    torchtnt \
-    s3torchconnector \
-    fastai \
-    accelerate \
-    # pin numpy requirement for fastai dependency
-    # requires explicit declaration of spacy, thic, blis
-    spacy \
-    thinc \
-    blis \
-    "numpy<2" \
- && pip uninstall -y dataclasses
-
-RUN HOME_DIR=/root \
- && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
- && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
- && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
- && chmod +x /usr/local/bin/testOSSCompliance \
- && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
- && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
- && rm -rf ${HOME_DIR}/oss_compliance* \
- && rm -rf /tmp/tmp*
-
-# Removing the cache as it is needed for security verification
-RUN rm -rf /root/.cache | true
-
-# Starts framework
-CMD ["/bin/bash"]
+# FROM common AS ec2
+#
+# ARG PYTHON
+# ARG TORCH_URL
+# ARG TORCHVISION_URL
+# ARG TORCHAUDIO_URL
+# ARG TORCHTEXT_URL
+#
+# WORKDIR /
+#
+# # Install PyTorch
+# RUN pip install --no-cache-dir -U \
+#     ${TORCH_URL} \
+#     ${TORCHVISION_URL} \
+#     ${TORCHAUDIO_URL} \
+#     ${TORCHTEXT_URL} \
+#     torchtnt \
+#     s3torchconnector \
+#     fastai \
+#     accelerate \
+#     # pin numpy requirement for fastai dependency
+#     # requires explicit declaration of spacy, thic, blis
+#     spacy \
+#     thinc \
+#     blis \
+#     "numpy<2" \
+#  && pip uninstall -y dataclasses
+#
+# RUN HOME_DIR=/root \
+#  && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+#  && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+#  && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+#  && chmod +x /usr/local/bin/testOSSCompliance \
+#  && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+#  && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+#  && rm -rf ${HOME_DIR}/oss_compliance* \
+#  && rm -rf /tmp/tmp*
+#
+# # Removing the cache as it is needed for security verification
+# RUN rm -rf /root/.cache | true
+#
+# # Starts framework
+# CMD ["/bin/bash"]
 
 #################################################################
 #  ____                   __  __       _
@@ -310,7 +310,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     # requires explicit declaration of spacy, thic, blis
     spacy \
     thinc \
-    blis \
+    "blis<1" \
     "numpy<2" \
  && /opt/conda/bin/mamba clean -afy
 
diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
index 9a96d06b509a..ef98e8a997bf 100644
--- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
+++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
@@ -283,92 +283,92 @@ RUN rm -rf /root/.cache | true
 #                  |_|
 ########################################################
 
-FROM common AS ec2
-
-ARG PYTHON
-ARG NCCL_VERSION
-ARG GDRCOPY_VERSION
-ARG APEX_VERSION
-ARG TORCH_URL
-ARG TORCHVISION_URL
-ARG TORCHAUDIO_URL
-ARG TORCHTEXT_URL
-
-WORKDIR /
-
-# Install PyTorch
-RUN pip install --no-cache-dir -U \
-    ${TORCH_URL} \
-    ${TORCHVISION_URL} \
-    ${TORCHAUDIO_URL} \
-    ${TORCHTEXT_URL} \
-    torchtnt \
-    triton \
-    s3torchconnector \
-    fastai \
-    accelerate \
-    # pin numpy requirement for fastai dependency
-    # requires explicit declaration of spacy, thic, blis
-    spacy \
-    thinc \
-    blis \
-    "numpy<2" \
- && pip uninstall dataclasses
-
-# Install GDRCopy which is a dependency of SM Distributed DataParallel binary
-# The test binaries requires cuda driver library which could be found in conda
-# So update the linker path to point to it to avoid -Lcuda not found
-RUN cd /tmp \
- && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
- && cd gdrcopy \
- && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
- && CUDA=${CUDA_HOME} make install \
- && rm -rf /tmp/gdrcopy
-
-# Install NCCL
-RUN cd /tmp \
- && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
- && cd nccl \
- && make -j64 src.build BUILDDIR=/usr/local \
- && rm -rf /tmp/nccl
-
-# Install Nvidia Apex (needs pytorch)
-RUN cd /tmp \
- && pip install --no-cache-dir packaging \
- && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \
- && cd apex \
- && pip install -v \
-                --disable-pip-version-check \
-                --no-cache-dir \
-                --no-build-isolation \
-                --config-settings "--build-option=--cpp_ext" \
-                --config-settings "--build-option=--cuda_ext" ./ \
- && rm -rf /tmp/apex
-
-# Install flash attn and NVIDIA transformer engine.
-# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
-ENV NVTE_FRAMEWORK=pytorch
-# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
-# Set MAX_JOBS=4 to avoid OOM issues in installation process
-RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation
-# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
-RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9
-
-RUN HOME_DIR=/root \
- && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
- && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
- && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
- && chmod +x /usr/local/bin/testOSSCompliance \
- && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
- && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
- && rm -rf ${HOME_DIR}/oss_compliance* \
- && rm -rf /tmp/tmp*
-
-# Removing the cache as it is needed for security verification
-RUN rm -rf /root/.cache | true
-
-# Starts framework
-CMD ["/bin/bash"]
+# FROM common AS ec2
+#
+# ARG PYTHON
+# ARG NCCL_VERSION
+# ARG GDRCOPY_VERSION
+# ARG APEX_VERSION
+# ARG TORCH_URL
+# ARG TORCHVISION_URL
+# ARG TORCHAUDIO_URL
+# ARG TORCHTEXT_URL
+#
+# WORKDIR /
+#
+# # Install PyTorch
+# RUN pip install --no-cache-dir -U \
+#     ${TORCH_URL} \
+#     ${TORCHVISION_URL} \
+#     ${TORCHAUDIO_URL} \
+#     ${TORCHTEXT_URL} \
+#     torchtnt \
+#     triton \
+#     s3torchconnector \
+#     fastai \
+#     accelerate \
+#     # pin numpy requirement for fastai dependency
+#     # requires explicit declaration of spacy, thic, blis
+#     spacy \
+#     thinc \
+#     blis \
+#     "numpy<2" \
+#  && pip uninstall dataclasses
+#
+# # Install GDRCopy which is a dependency of SM Distributed DataParallel binary
+# # The test binaries requires cuda driver library which could be found in conda
+# # So update the linker path to point to it to avoid -Lcuda not found
+# RUN cd /tmp \
+#  && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+#  && cd gdrcopy \
+#  && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+#  && CUDA=${CUDA_HOME} make install \
+#  && rm -rf /tmp/gdrcopy
+#
+# # Install NCCL
+# RUN cd /tmp \
+#  && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
+#  && cd nccl \
+#  && make -j64 src.build BUILDDIR=/usr/local \
+#  && rm -rf /tmp/nccl
+#
+# # Install Nvidia Apex (needs pytorch)
+# RUN cd /tmp \
+#  && pip install --no-cache-dir packaging \
+#  && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \
+#  && cd apex \
+#  && pip install -v \
+#                 --disable-pip-version-check \
+#                 --no-cache-dir \
+#                 --no-build-isolation \
+#                 --config-settings "--build-option=--cpp_ext" \
+#                 --config-settings "--build-option=--cuda_ext" ./ \
+#  && rm -rf /tmp/apex
+#
+# # Install flash attn and NVIDIA transformer engine.
+# # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
+# ENV NVTE_FRAMEWORK=pytorch
+# # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# # Set MAX_JOBS=4 to avoid OOM issues in installation process
+# RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation
+# # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+# RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9
+#
+# RUN HOME_DIR=/root \
+#  && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+#  && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+#  && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+#  && chmod +x /usr/local/bin/testOSSCompliance \
+#  && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+#  && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+#  && rm -rf ${HOME_DIR}/oss_compliance* \
+#  && rm -rf /tmp/tmp*
+#
+# # Removing the cache as it is needed for security verification
+# RUN rm -rf /root/.cache | true
+#
+# # Starts framework
+# CMD ["/bin/bash"]
 
 #################################################################
 #  ____                   __  __       _
@@ -492,7 +492,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     # requires explicit declaration of spacy, thic, blis
     spacy \
     thinc \
-    blis \
+    "blis<1" \
     "numpy<2" \
  && /opt/conda/bin/mamba clean -afy
 

From e92d2bed0ac137d22c18a72ab0ce2f448d0ddcd6 Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Tue, 10 Dec 2024 14:02:51 -0800
Subject: [PATCH 3/6] build cpu pip install blis

---
 pytorch/training/buildspec-2-4-sm.yml         | 34 +++++++++----------
 .../training/docker/2.4/py3/Dockerfile.cpu    |  8 +++--
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml
index 1cd563d6cbef..a99fb53f3613 100644
--- a/pytorch/training/buildspec-2-4-sm.yml
+++ b/pytorch/training/buildspec-2-4-sm.yml
@@ -47,20 +47,20 @@ images:
     target: sagemaker
     context:
       <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 21500
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py311
-    cuda_version: &CUDA_VERSION cu124
-    os_version: &OS_VERSION ubuntu22.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-                         *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
+  # BuildSageMakerGPUPTTrainPy3DockerImage:
+  #   <<: *TRAINING_REPOSITORY
+  #   build: &PYTORCH_GPU_TRAINING_PY3 false
+  #   image_size_baseline: 21500
+  #   device_type: &DEVICE_TYPE gpu
+  #   python_version: &DOCKER_PYTHON_VERSION py3
+  #   tag_python_version: &TAG_PYTHON_VERSION py311
+  #   cuda_version: &CUDA_VERSION cu124
+  #   os_version: &OS_VERSION ubuntu22.04
+  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+  #   # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+  #   # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+  #                        *DEVICE_TYPE ]
+  #   target: sagemaker
+  #   context:
+  #     <<: *TRAINING_CONTEXT
diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
index 4de4feae40cc..294f89f5f0e0 100644
--- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
@@ -306,13 +306,15 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     shap \
     # pinned for sagemaker==2.235.1
     "cloudpickle==2.2.1" \
+ && /opt/conda/bin/mamba clean -afy
+
+RUN pip install --no-cache-dir -U \
     # pin numpy requirement for sagemaker dependency
     # requires explicit declaration of spacy, thic, blis
     spacy \
     thinc \
-    "blis<1" \
-    "numpy<2" \
- && /opt/conda/bin/mamba clean -afy
+    blis \
+    "numpy<2"
 
 # Copy workaround script for incorrect hostname
 COPY changehostname.c /

From 4d69986b1302fb720f48e4ebf26bd3258cb0590c Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Tue, 10 Dec 2024 14:32:12 -0800
Subject: [PATCH 4/6] build test all

---
 pytorch/training/docker/2.4/py3/Dockerfile.cpu       | 1 +
 pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
index 294f89f5f0e0..365b4fd57123 100644
--- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
@@ -311,6 +311,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
 RUN pip install --no-cache-dir -U \
     # pin numpy requirement for sagemaker dependency
     # requires explicit declaration of spacy, thic, blis
+    # pip install due to pip check conflict with conda package
     spacy \
     thinc \
     blis \
diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
index ef98e8a997bf..dcd1b273451a 100644
--- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
+++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
@@ -488,13 +488,16 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
     seaborn \
     # pinned for sagemaker==2.235.1
     "cloudpickle==2.2.1" \
+ && /opt/conda/bin/mamba clean -afy
+
+RUN pip install --no-cache-dir -U \
     # pin numpy requirement for sagemaker dependency
     # requires explicit declaration of spacy, thic, blis
+    # pip install due to pip check conflict with conda package
     spacy \
     thinc \
-    "blis<1" \
-    "numpy<2" \
- && /opt/conda/bin/mamba clean -afy
+    blis \
+    "numpy<2"
 
 # Add SageMaker DataParallel to LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_SHORT_VERSION}/site-packages/smdistributed/dataparallel/lib:$LD_LIBRARY_PATH"

From e4d59d762f4bcc24352d5538bef7c175407f5180 Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Tue, 10 Dec 2024 15:15:34 -0800
Subject: [PATCH 5/6] build all

---
 pytorch/training/buildspec-2-4-sm.yml | 34 +++++++++++++--------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml
index a99fb53f3613..1cd563d6cbef 100644
--- a/pytorch/training/buildspec-2-4-sm.yml
+++ b/pytorch/training/buildspec-2-4-sm.yml
@@ -47,20 +47,20 @@ images:
     target: sagemaker
     context:
       <<: *TRAINING_CONTEXT
-  # BuildSageMakerGPUPTTrainPy3DockerImage:
-  #   <<: *TRAINING_REPOSITORY
-  #   build: &PYTORCH_GPU_TRAINING_PY3 false
-  #   image_size_baseline: 21500
-  #   device_type: &DEVICE_TYPE gpu
-  #   python_version: &DOCKER_PYTHON_VERSION py3
-  #   tag_python_version: &TAG_PYTHON_VERSION py311
-  #   cuda_version: &CUDA_VERSION cu124
-  #   os_version: &OS_VERSION ubuntu22.04
-  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-  #   # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-  #   # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
-  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-  #                        *DEVICE_TYPE ]
-  #   target: sagemaker
-  #   context:
-  #     <<: *TRAINING_CONTEXT
+  BuildSageMakerGPUPTTrainPy3DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 21500
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py311
+    cuda_version: &CUDA_VERSION cu124
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+    # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+                         *DEVICE_TYPE ]
+    target: sagemaker
+    context:
+      <<: *TRAINING_CONTEXT

From 44b1eba946cbab92e62d4ed046f96b183833117b Mon Sep 17 00:00:00 2001
From: sirutBuasai <sirutbuasai27@outlook.com>
Date: Wed, 11 Dec 2024 14:31:24 -0800
Subject: [PATCH 6/6] revert dockerfile and toml

---
 dlc_developer_config.toml                     | 112 ++++++++++--
 .../training/docker/2.4/py3/Dockerfile.cpu    |  86 ++++-----
 .../docker/2.4/py3/cu124/Dockerfile.gpu       | 172 +++++++++---------
 3 files changed, 230 insertions(+), 140 deletions(-)

diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml
index 212b285c205f..e54e9a26290b 100644
--- a/dlc_developer_config.toml
+++ b/dlc_developer_config.toml
@@ -1,77 +1,167 @@
 [dev]
+# Set to "huggingface", for example, if you are a huggingface developer. Default is ""
 partner_developer = ""
+# Please only set it to true if you are preparing an EI related PR
+# Do remember to revert it back to false before merging any PR (including EI dedicated PR)
 ei_mode = false
+# Please only set it to true if you are preparing a NEURON related PR
+# Do remember to revert it back to false before merging any PR (including NEURON dedicated PR)
 neuron_mode = false
+# Please only set it to true if you are preparing a NEURONX related PR
+# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR)
 neuronx_mode = false
+# Please only set it to true if you are preparing a GRAVITON related PR
+# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR)
 graviton_mode = false
+# Please only set it to true if you are preparing an ARM64 related PR
+# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
 arm64_mode = false
+# Please only set it to true if you are preparing a HABANA related PR
+# Do remember to revert it back to false before merging any PR (including HABANA dedicated PR)
 habana_mode = false
+# Please only set it to true if you are preparing a HUGGINGFACE TRCOMP related PR
+# Do remember to revert it back to false before merging any PR (including HUGGINGFACE TRCOMP dedicated PR)
+# This mode is used to build TF 2.6 and PT1.11 DLC
 huggingface_trcomp_mode = false
+# Please only set it to true if you are preparing a TRCOMP related PR
+# Do remember to revert it back to false before merging any PR (including TRCOMP dedicated PR)
+# This mode is used to build PT1.12 and above DLC
 trcomp_mode = false
+# Set deep_canary_mode to true to simulate Deep Canary Test conditions on PR for all frameworks in the
+# build_frameworks list below. This will cause all image builds and non-deep-canary tests on the PR to be skipped,
+# regardless of whether they are enabled or disabled below.
+# Set graviton_mode/arm64_mode to true to run Deep Canaries on Graviton/ARM64 images.
+# Do remember to revert it back to false before merging any PR.
 deep_canary_mode = false
 
 [build]
-build_frameworks = [ "pytorch",]
+# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
+# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"]
+build_frameworks = []
+
+# By default we build both training and inference containers. Set true/false values to determine which to build.
 build_training = true
-build_inference = false
+build_inference = true
+
+# Set do_build to false to skip builds and test the latest image built by this PR
+# Note: at least one build must already exist for this PR before do_build can be set to false
 do_build = true
 
 [notify]
+### Notify on test failures
+### Off by default
 notify_test_failures = false
-notification_severity = "medium"
+  # Valid values: medium or high
+  notification_severity = "medium"
 
 [test]
+### On by default
 sanity_tests = true
 security_tests = true
-safety_check_test = false
-ecr_scan_allowlist_feature = false
+  safety_check_test = false
+  ecr_scan_allowlist_feature = false
 ecs_tests = true
 eks_tests = true
 ec2_tests = true
-ec2_benchmark_tests = true
-ec2_tests_on_heavy_instances = true
+# Set it to true if you are preparing a Benchmark related PR
+ec2_benchmark_tests = false
+
+### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by
+### default. If false, these types of tests will be skipped while other tests will run as usual.
+### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true.
+### Off by default (set to false)
+ec2_tests_on_heavy_instances = false
+
+### SM specific tests
+### On by default
 sagemaker_local_tests = true
+
+# run standard sagemaker remote tests from test/sagemaker_tests
 sagemaker_remote_tests = true
-sagemaker_efa_tests = true
-sagemaker_rc_tests = true
-sagemaker_benchmark_tests = true
+# run efa sagemaker tests
+sagemaker_efa_tests = false
+# run release_candidate_integration tests
+sagemaker_rc_tests = false
+# run sagemaker benchmark tests
+sagemaker_benchmark_tests = false
+
+# SM remote EFA test instance type
 sagemaker_remote_efa_instance_type = ""
+
+# Run CI tests for nightly images
+# false by default
 nightly_pr_test_mode = false
+
 use_scheduler = false
 
 [buildspec_override]
+# Assign the path to the required buildspec file from the deep-learning-containers folder
+# For example:
+# dlc-pr-tensorflow-2-habana-training = "habana/tensorflow/training/buildspec-2-10.yml"
+# dlc-pr-pytorch-inference = "pytorch/inference/buildspec-1-12.yml"
+# Setting the buildspec file path to "" allows the image builder to choose the default buildspec file.
+
+### TRAINING PR JOBS ###
+
+# Standard Framework Training
 dlc-pr-mxnet-training = ""
-dlc-pr-pytorch-training = "pytorch/training/buildspec-2-4-sm.yml"
+dlc-pr-pytorch-training = ""
 dlc-pr-tensorflow-2-training = ""
 dlc-pr-autogluon-training = ""
+
+# HuggingFace Training
 dlc-pr-huggingface-tensorflow-training = ""
 dlc-pr-huggingface-pytorch-training = ""
+
+# Training Compiler
 dlc-pr-huggingface-pytorch-trcomp-training = ""
 dlc-pr-huggingface-tensorflow-2-trcomp-training = ""
 dlc-pr-pytorch-trcomp-training = ""
+
+# Neuron Training
 dlc-pr-mxnet-neuron-training = ""
 dlc-pr-pytorch-neuron-training = ""
 dlc-pr-tensorflow-2-neuron-training = ""
+
+# Stability AI Training
 dlc-pr-stabilityai-pytorch-training = ""
+
+# Habana Training
 dlc-pr-pytorch-habana-training = ""
 dlc-pr-tensorflow-2-habana-training = ""
+
+### INFERENCE PR JOBS ###
+
+# Standard Framework Inference
 dlc-pr-mxnet-inference = ""
 dlc-pr-pytorch-inference = ""
 dlc-pr-tensorflow-2-inference = ""
 dlc-pr-autogluon-inference = ""
+
+# Neuron Inference
 dlc-pr-mxnet-neuron-inference = ""
 dlc-pr-pytorch-neuron-inference = ""
 dlc-pr-tensorflow-1-neuron-inference = ""
 dlc-pr-tensorflow-2-neuron-inference = ""
+
+# HuggingFace Inference
 dlc-pr-huggingface-tensorflow-inference = ""
 dlc-pr-huggingface-pytorch-inference = ""
 dlc-pr-huggingface-pytorch-neuron-inference = ""
+
+# Stability AI Inference
 dlc-pr-stabilityai-pytorch-inference = ""
+
+# Graviton Inference
 dlc-pr-mxnet-graviton-inference = ""
 dlc-pr-pytorch-graviton-inference = ""
 dlc-pr-tensorflow-2-graviton-inference = ""
+
+# ARM64 Inference
 dlc-pr-pytorch-arm64-inference = ""
 dlc-pr-tensorflow-2-arm64-inference = ""
+
+# EIA Inference
 dlc-pr-mxnet-eia-inference = ""
 dlc-pr-pytorch-eia-inference = ""
 dlc-pr-tensorflow-2-eia-inference = ""
diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
index 365b4fd57123..d94c486a0294 100644
--- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu
+++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu
@@ -200,49 +200,49 @@ RUN rm -rf /root/.cache | true
 #                  |_|
 ########################################################
 
-# FROM common AS ec2
-#
-# ARG PYTHON
-# ARG TORCH_URL
-# ARG TORCHVISION_URL
-# ARG TORCHAUDIO_URL
-# ARG TORCHTEXT_URL
-#
-# WORKDIR /
-#
-# # Install PyTorch
-# RUN pip install --no-cache-dir -U \
-#     ${TORCH_URL} \
-#     ${TORCHVISION_URL} \
-#     ${TORCHAUDIO_URL} \
-#     ${TORCHTEXT_URL} \
-#     torchtnt \
-#     s3torchconnector \
-#     fastai \
-#     accelerate \
-#     # pin numpy requirement for fastai dependency
-#     # requires explicit declaration of spacy, thic, blis
-#     spacy \
-#     thinc \
-#     blis \
-#     "numpy<2" \
-#  && pip uninstall -y dataclasses
-#
-# RUN HOME_DIR=/root \
-#  && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
-#  && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
-#  && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
-#  && chmod +x /usr/local/bin/testOSSCompliance \
-#  && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
-#  && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
-#  && rm -rf ${HOME_DIR}/oss_compliance* \
-#  && rm -rf /tmp/tmp*
-#
-# # Removing the cache as it is needed for security verification
-# RUN rm -rf /root/.cache | true
-#
-# # Starts framework
-# CMD ["/bin/bash"]
+FROM common AS ec2
+
+ARG PYTHON
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+
+WORKDIR /
+
+# Install PyTorch
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    torchtnt \
+    s3torchconnector \
+    fastai \
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+# Starts framework
+CMD ["/bin/bash"]
 
 #################################################################
 #  ____                   __  __       _
diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
index dcd1b273451a..c28ee66c64bd 100644
--- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
+++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu
@@ -283,92 +283,92 @@ RUN rm -rf /root/.cache | true
 #                  |_|
 ########################################################
 
-# FROM common AS ec2
-#
-# ARG PYTHON
-# ARG NCCL_VERSION
-# ARG GDRCOPY_VERSION
-# ARG APEX_VERSION
-# ARG TORCH_URL
-# ARG TORCHVISION_URL
-# ARG TORCHAUDIO_URL
-# ARG TORCHTEXT_URL
-#
-# WORKDIR /
-#
-# # Install PyTorch
-# RUN pip install --no-cache-dir -U \
-#     ${TORCH_URL} \
-#     ${TORCHVISION_URL} \
-#     ${TORCHAUDIO_URL} \
-#     ${TORCHTEXT_URL} \
-#     torchtnt \
-#     triton \
-#     s3torchconnector \
-#     fastai \
-#     accelerate \
-#     # pin numpy requirement for fastai dependency
-#     # requires explicit declaration of spacy, thic, blis
-#     spacy \
-#     thinc \
-#     blis \
-#     "numpy<2" \
-#  && pip uninstall dataclasses
-#
-# # Install GDRCopy which is a dependency of SM Distributed DataParallel binary
-# # The test binaries requires cuda driver library which could be found in conda
-# # So update the linker path to point to it to avoid -Lcuda not found
-# RUN cd /tmp \
-#  && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
-#  && cd gdrcopy \
-#  && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
-#  && CUDA=${CUDA_HOME} make install \
-#  && rm -rf /tmp/gdrcopy
-#
-# # Install NCCL
-# RUN cd /tmp \
-#  && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
-#  && cd nccl \
-#  && make -j64 src.build BUILDDIR=/usr/local \
-#  && rm -rf /tmp/nccl
-#
-# # Install Nvidia Apex (needs pytorch)
-# RUN cd /tmp \
-#  && pip install --no-cache-dir packaging \
-#  && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \
-#  && cd apex \
-#  && pip install -v \
-#                 --disable-pip-version-check \
-#                 --no-cache-dir \
-#                 --no-build-isolation \
-#                 --config-settings "--build-option=--cpp_ext" \
-#                 --config-settings "--build-option=--cuda_ext" ./ \
-#  && rm -rf /tmp/apex
-#
-# # Install flash attn and NVIDIA transformer engine.
-# # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
-# ENV NVTE_FRAMEWORK=pytorch
-# # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
-# # Set MAX_JOBS=4 to avoid OOM issues in installation process
-# RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation
-# # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
-# RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9
-#
-# RUN HOME_DIR=/root \
-#  && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
-#  && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
-#  && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
-#  && chmod +x /usr/local/bin/testOSSCompliance \
-#  && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
-#  && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
-#  && rm -rf ${HOME_DIR}/oss_compliance* \
-#  && rm -rf /tmp/tmp*
-#
-# # Removing the cache as it is needed for security verification
-# RUN rm -rf /root/.cache | true
-#
-# # Starts framework
-# CMD ["/bin/bash"]
+FROM common AS ec2
+
+ARG PYTHON
+ARG NCCL_VERSION
+ARG GDRCOPY_VERSION
+ARG APEX_VERSION
+ARG TORCH_URL
+ARG TORCHVISION_URL
+ARG TORCHAUDIO_URL
+ARG TORCHTEXT_URL
+
+WORKDIR /
+
+# Install PyTorch and companion packages, then remove any "dataclasses" package without prompting (-y)
+RUN pip install --no-cache-dir -U \
+    ${TORCH_URL} \
+    ${TORCHVISION_URL} \
+    ${TORCHAUDIO_URL} \
+    ${TORCHTEXT_URL} \
+    torchtnt \
+    triton \
+    s3torchconnector \
+    fastai \
+    accelerate \
+    # pin numpy requirement for fastai dependency
+    # requires explicit declaration of spacy, thinc, blis
+    spacy \
+    thinc \
+    blis \
+    "numpy<2" \
+ && pip uninstall -y dataclasses
+
+# Install GDRCopy which is a dependency of SM Distributed DataParallel binary
+# The test binaries requires cuda driver library which could be found in conda
+# So update the linker path to point to it to avoid -Lcuda not found
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
+ && cd gdrcopy \
+ && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \
+ && CUDA=${CUDA_HOME} make install \
+ && rm -rf /tmp/gdrcopy
+
+# Install NCCL
+RUN cd /tmp \
+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
+ && cd nccl \
+ && make -j64 src.build BUILDDIR=/usr/local \
+ && rm -rf /tmp/nccl
+
+# Install Nvidia Apex (needs pytorch)
+RUN cd /tmp \
+ && pip install --no-cache-dir packaging \
+ && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \
+ && cd apex \
+ && pip install -v \
+                --disable-pip-version-check \
+                --no-cache-dir \
+                --no-build-isolation \
+                --config-settings "--build-option=--cpp_ext" \
+                --config-settings "--build-option=--cuda_ext" ./ \
+ && rm -rf /tmp/apex
+
+# Install flash attn and NVIDIA transformer engine.
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues in installation process
+RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9
+
+RUN HOME_DIR=/root \
+ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+ && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+ && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+ && chmod +x /usr/local/bin/testOSSCompliance \
+ && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+ && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+ && rm -rf ${HOME_DIR}/oss_compliance* \
+ && rm -rf /tmp/tmp*
+
+# Removing the cache as it is needed for security verification
+RUN rm -rf /root/.cache | true
+
+# Starts framework
+CMD ["/bin/bash"]
 
 #################################################################
 #  ____                   __  __       _