From 838960a5e2aadd9e2dfe28ace3de37a2a8d93efc Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Mon, 9 Dec 2024 23:07:52 -0800 Subject: [PATCH 1/6] Patch PT 2.4 Training SM DLC --- dlc_developer_config.toml | 112 ++---------------- ...ockerfile.sagemaker.cpu.core_packages.json | 4 +- ...ockerfile.sagemaker.gpu.core_packages.json | 4 +- 3 files changed, 15 insertions(+), 105 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e54e9a26290b..212b285c205f 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,167 +1,77 @@ [dev] -# Set to "huggingface", for example, if you are a huggingface developer. Default is "" partner_developer = "" -# Please only set it to true if you are preparing an EI related PR -# Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false -# Please only set it to true if you are preparing a NEURON related PR -# Do remember to revert it back to false before merging any PR (including NEURON dedicated PR) neuron_mode = false -# Please only set it to true if you are preparing a NEURONX related PR -# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR) neuronx_mode = false -# Please only set it to true if you are preparing a GRAVITON related PR -# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR) graviton_mode = false -# Please only set it to true if you are preparing a ARM64 related PR -# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) arm64_mode = false -# Please only set it to True if you are preparing a HABANA related PR -# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false -# Please only set it to True if you are preparing a HUGGINGFACE TRCOMP related PR -# Do remember to revert it back to False before merging any PR (including HUGGINGFACE TRCOMP dedicated 
PR) -# This mode is used to build TF 2.6 and PT1.11 DLC huggingface_trcomp_mode = false -# Please only set it to True if you are preparing a TRCOMP related PR -# Do remember to revert it back to False before merging any PR (including TRCOMP dedicated PR) -# This mode is used to build PT1.12 and above DLC trcomp_mode = false -# Set deep_canary_mode to true to simulate Deep Canary Test conditions on PR for all frameworks in the -# build_frameworks list below. This will cause all image builds and non-deep-canary tests on the PR to be skipped, -# regardless of whether they are enabled or disabled below. -# Set graviton_mode/arm64_mode to true to run Deep Canaries on Graviton/ARM64 images. -# Do remember to revert it back to false before merging any PR. deep_canary_mode = false [build] -# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. -# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] - -# By default we build both training and inference containers. Set true/false values to determine which to build. 
+build_frameworks = [ "pytorch",] build_training = true -build_inference = true - -# Set do_build to "false" to skip builds and test the latest image built by this PR -# Note: at least one build is required to set do_build to "false" +build_inference = false do_build = true [notify] -### Notify on test failures -### Off by default notify_test_failures = false - # Valid values: medium or high - notification_severity = "medium" +notification_severity = "medium" [test] -### On by default sanity_tests = true security_tests = true - safety_check_test = false - ecr_scan_allowlist_feature = false +safety_check_test = false +ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true -# Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false - -### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by -### default. If false, these types of tests will be skipped while other tests will run as usual. -### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
-### Off by default (set to false) -ec2_tests_on_heavy_instances = false - -### SM specific tests -### On by default +ec2_benchmark_tests = true +ec2_tests_on_heavy_instances = true sagemaker_local_tests = true - -# run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true -# run efa sagemaker tests -sagemaker_efa_tests = false -# run release_candidate_integration tests -sagemaker_rc_tests = false -# run sagemaker benchmark tests -sagemaker_benchmark_tests = false - -# SM remote EFA test instance type +sagemaker_efa_tests = true +sagemaker_rc_tests = true +sagemaker_benchmark_tests = true sagemaker_remote_efa_instance_type = "" - -# Run CI tests for nightly images -# false by default nightly_pr_test_mode = false - use_scheduler = false [buildspec_override] -# Assign the path to the required buildspec file from the deep-learning-containers folder -# For example: -# dlc-pr-tensorflow-2-habana-training = "habana/tensorflow/training/buildspec-2-10.yml" -# dlc-pr-pytorch-inference = "pytorch/inference/buildspec-1-12.yml" -# Setting the buildspec file path to "" allows the image builder to choose the default buildspec file. 
- -### TRAINING PR JOBS ### - -# Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "" +dlc-pr-pytorch-training = "pytorch/training/buildspec-2-4-sm.yml" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" - -# HuggingFace Training dlc-pr-huggingface-tensorflow-training = "" dlc-pr-huggingface-pytorch-training = "" - -# Training Compiler dlc-pr-huggingface-pytorch-trcomp-training = "" dlc-pr-huggingface-tensorflow-2-trcomp-training = "" dlc-pr-pytorch-trcomp-training = "" - -# Neuron Training dlc-pr-mxnet-neuron-training = "" dlc-pr-pytorch-neuron-training = "" dlc-pr-tensorflow-2-neuron-training = "" - -# Stability AI Training dlc-pr-stabilityai-pytorch-training = "" - -# Habana Training dlc-pr-pytorch-habana-training = "" dlc-pr-tensorflow-2-habana-training = "" - -### INFERENCE PR JOBS ### - -# Standard Framework Inference dlc-pr-mxnet-inference = "" dlc-pr-pytorch-inference = "" dlc-pr-tensorflow-2-inference = "" dlc-pr-autogluon-inference = "" - -# Neuron Inference dlc-pr-mxnet-neuron-inference = "" dlc-pr-pytorch-neuron-inference = "" dlc-pr-tensorflow-1-neuron-inference = "" dlc-pr-tensorflow-2-neuron-inference = "" - -# HuggingFace Inference dlc-pr-huggingface-tensorflow-inference = "" dlc-pr-huggingface-pytorch-inference = "" dlc-pr-huggingface-pytorch-neuron-inference = "" - -# Stability AI Inference dlc-pr-stabilityai-pytorch-inference = "" - -# Graviton Inference dlc-pr-mxnet-graviton-inference = "" dlc-pr-pytorch-graviton-inference = "" dlc-pr-tensorflow-2-graviton-inference = "" - -# ARM64 Inference dlc-pr-pytorch-arm64-inference = "" dlc-pr-tensorflow-2-arm64-inference = "" - -# EIA Inference dlc-pr-mxnet-eia-inference = "" dlc-pr-pytorch-eia-inference = "" dlc-pr-tensorflow-2-eia-inference = "" diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json index 550b7143779a..d01b3ecb877c 100644 --- 
a/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json +++ b/pytorch/training/docker/2.4/py3/Dockerfile.sagemaker.cpu.core_packages.json @@ -1,6 +1,6 @@ { "accelerate": { - "version_specifier": "==1.1.1", + "version_specifier": "==1.2.0", "skip": "True" }, "fastai": { @@ -8,7 +8,7 @@ "skip": "True" }, "s3torchconnector": { - "version_specifier": "==1.2.7", + "version_specifier": "==1.3.0", "skip": "True" }, "torchaudio": { diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json index 85af35715996..5932b03226bf 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.sagemaker.gpu.core_packages.json @@ -1,6 +1,6 @@ { "accelerate": { - "version_specifier": "==1.1.1", + "version_specifier": "==1.2.0", "skip": "True" }, "fastai": { @@ -12,7 +12,7 @@ "skip": "True" }, "s3torchconnector": { - "version_specifier": "==1.2.7", + "version_specifier": "==1.3.0", "skip": "True" }, "torchaudio": { From dc04db017ae714c54a4c6c3f681bfc6bf0ab24af Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Tue, 10 Dec 2024 11:04:09 -0800 Subject: [PATCH 2/6] pin blis --- .../training/docker/2.4/py3/Dockerfile.cpu | 88 ++++----- .../docker/2.4/py3/cu124/Dockerfile.gpu | 174 +++++++++--------- 2 files changed, 131 insertions(+), 131 deletions(-) diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu index dfc34bb46e7d..4de4feae40cc 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu @@ -200,49 +200,49 @@ RUN rm -rf /root/.cache | true # |_| ######################################################## -FROM common AS ec2 - -ARG PYTHON -ARG TORCH_URL -ARG TORCHVISION_URL -ARG TORCHAUDIO_URL -ARG TORCHTEXT_URL - -WORKDIR / - -# Install PyTorch -RUN pip 
install --no-cache-dir -U \ - ${TORCH_URL} \ - ${TORCHVISION_URL} \ - ${TORCHAUDIO_URL} \ - ${TORCHTEXT_URL} \ - torchtnt \ - s3torchconnector \ - fastai \ - accelerate \ - # pin numpy requirement for fastai dependency - # requires explicit declaration of spacy, thic, blis - spacy \ - thinc \ - blis \ - "numpy<2" \ - && pip uninstall -y dataclasses - -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - && rm -rf /tmp/tmp* - -# Removing the cache as it is needed for security verification -RUN rm -rf /root/.cache | true - -# Starts framework -CMD ["/bin/bash"] +# FROM common AS ec2 +# +# ARG PYTHON +# ARG TORCH_URL +# ARG TORCHVISION_URL +# ARG TORCHAUDIO_URL +# ARG TORCHTEXT_URL +# +# WORKDIR / +# +# # Install PyTorch +# RUN pip install --no-cache-dir -U \ +# ${TORCH_URL} \ +# ${TORCHVISION_URL} \ +# ${TORCHAUDIO_URL} \ +# ${TORCHTEXT_URL} \ +# torchtnt \ +# s3torchconnector \ +# fastai \ +# accelerate \ +# # pin numpy requirement for fastai dependency +# # requires explicit declaration of spacy, thic, blis +# spacy \ +# thinc \ +# blis \ +# "numpy<2" \ +# && pip uninstall -y dataclasses +# +# RUN HOME_DIR=/root \ +# && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ +# && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ +# && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ +# && chmod +x /usr/local/bin/testOSSCompliance \ +# && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ 
+# && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ +# && rm -rf ${HOME_DIR}/oss_compliance* \ +# && rm -rf /tmp/tmp* +# +# # Removing the cache as it is needed for security verification +# RUN rm -rf /root/.cache | true +# +# # Starts framework +# CMD ["/bin/bash"] ################################################################# # ____ __ __ _ @@ -310,7 +310,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ # requires explicit declaration of spacy, thic, blis spacy \ thinc \ - blis \ + "blis<1" \ "numpy<2" \ && /opt/conda/bin/mamba clean -afy diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu index 9a96d06b509a..ef98e8a997bf 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu @@ -283,92 +283,92 @@ RUN rm -rf /root/.cache | true # |_| ######################################################## -FROM common AS ec2 - -ARG PYTHON -ARG NCCL_VERSION -ARG GDRCOPY_VERSION -ARG APEX_VERSION -ARG TORCH_URL -ARG TORCHVISION_URL -ARG TORCHAUDIO_URL -ARG TORCHTEXT_URL - -WORKDIR / - -# Install PyTorch -RUN pip install --no-cache-dir -U \ - ${TORCH_URL} \ - ${TORCHVISION_URL} \ - ${TORCHAUDIO_URL} \ - ${TORCHTEXT_URL} \ - torchtnt \ - triton \ - s3torchconnector \ - fastai \ - accelerate \ - # pin numpy requirement for fastai dependency - # requires explicit declaration of spacy, thic, blis - spacy \ - thinc \ - blis \ - "numpy<2" \ - && pip uninstall dataclasses - -# Install GDRCopy which is a dependency of SM Distributed DataParallel binary -# The test binaries requires cuda driver library which could be found in conda -# So update the linker path to point to it to avoid -Lcuda not found -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ - && cd gdrcopy \ - && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ - && CUDA=${CUDA_HOME} make install \ - 
&& rm -rf /tmp/gdrcopy - -# Install NCCL -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ - && cd nccl \ - && make -j64 src.build BUILDDIR=/usr/local \ - && rm -rf /tmp/nccl - -# Install Nvidia Apex (needs pytorch) -RUN cd /tmp \ - && pip install --no-cache-dir packaging \ - && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \ - && cd apex \ - && pip install -v \ - --disable-pip-version-check \ - --no-cache-dir \ - --no-build-isolation \ - --config-settings "--build-option=--cpp_ext" \ - --config-settings "--build-option=--cuda_ext" ./ \ - && rm -rf /tmp/apex - -# Install flash attn and NVIDIA transformer engine. -# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install -ENV NVTE_FRAMEWORK=pytorch -# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features -# Set MAX_JOBS=4 to avoid OOM issues in installation process -RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation -# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html -RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9 - -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - && rm -rf /tmp/tmp* - -# Removing the cache as it is needed for security verification -RUN rm -rf /root/.cache | true - -# Starts framework -CMD ["/bin/bash"] +# FROM common AS ec2 +# +# 
ARG PYTHON +# ARG NCCL_VERSION +# ARG GDRCOPY_VERSION +# ARG APEX_VERSION +# ARG TORCH_URL +# ARG TORCHVISION_URL +# ARG TORCHAUDIO_URL +# ARG TORCHTEXT_URL +# +# WORKDIR / +# +# # Install PyTorch +# RUN pip install --no-cache-dir -U \ +# ${TORCH_URL} \ +# ${TORCHVISION_URL} \ +# ${TORCHAUDIO_URL} \ +# ${TORCHTEXT_URL} \ +# torchtnt \ +# triton \ +# s3torchconnector \ +# fastai \ +# accelerate \ +# # pin numpy requirement for fastai dependency +# # requires explicit declaration of spacy, thic, blis +# spacy \ +# thinc \ +# blis \ +# "numpy<2" \ +# && pip uninstall dataclasses +# +# # Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# # The test binaries requires cuda driver library which could be found in conda +# # So update the linker path to point to it to avoid -Lcuda not found +# RUN cd /tmp \ +# && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ +# && cd gdrcopy \ +# && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ +# && CUDA=${CUDA_HOME} make install \ +# && rm -rf /tmp/gdrcopy +# +# # Install NCCL +# RUN cd /tmp \ +# && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ +# && cd nccl \ +# && make -j64 src.build BUILDDIR=/usr/local \ +# && rm -rf /tmp/nccl +# +# # Install Nvidia Apex (needs pytorch) +# RUN cd /tmp \ +# && pip install --no-cache-dir packaging \ +# && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \ +# && cd apex \ +# && pip install -v \ +# --disable-pip-version-check \ +# --no-cache-dir \ +# --no-build-isolation \ +# --config-settings "--build-option=--cpp_ext" \ +# --config-settings "--build-option=--cuda_ext" ./ \ +# && rm -rf /tmp/apex +# +# # Install flash attn and NVIDIA transformer engine. 
+# # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +# ENV NVTE_FRAMEWORK=pytorch +# # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# # Set MAX_JOBS=4 to avoid OOM issues in installation process +# RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation +# # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +# RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9 +# +# RUN HOME_DIR=/root \ +# && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ +# && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ +# && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ +# && chmod +x /usr/local/bin/testOSSCompliance \ +# && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ +# && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ +# && rm -rf ${HOME_DIR}/oss_compliance* \ +# && rm -rf /tmp/tmp* +# +# # Removing the cache as it is needed for security verification +# RUN rm -rf /root/.cache | true +# +# # Starts framework +# CMD ["/bin/bash"] ################################################################# # ____ __ __ _ @@ -492,7 +492,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ # requires explicit declaration of spacy, thic, blis spacy \ thinc \ - blis \ + "blis<1" \ "numpy<2" \ && /opt/conda/bin/mamba clean -afy From e92d2bed0ac137d22c18a72ab0ce2f448d0ddcd6 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Tue, 10 Dec 2024 14:02:51 -0800 Subject: [PATCH 3/6] build cpu pip install blis --- pytorch/training/buildspec-2-4-sm.yml | 34 +++++++++---------- .../training/docker/2.4/py3/Dockerfile.cpu | 8 +++-- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git 
a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml index 1cd563d6cbef..a99fb53f3613 100644 --- a/pytorch/training/buildspec-2-4-sm.yml +++ b/pytorch/training/buildspec-2-4-sm.yml @@ -47,20 +47,20 @@ images: target: sagemaker context: <<: *TRAINING_CONTEXT - BuildSageMakerGPUPTTrainPy3DockerImage: - <<: *TRAINING_REPOSITORY - build: &PYTORCH_GPU_TRAINING_PY3 false - image_size_baseline: 21500 - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py311 - cuda_version: &CUDA_VERSION cu124 - os_version: &OS_VERSION ubuntu22.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., - *DEVICE_TYPE ] - target: sagemaker - context: - <<: *TRAINING_CONTEXT + # BuildSageMakerGPUPTTrainPy3DockerImage: + # <<: *TRAINING_REPOSITORY + # build: &PYTORCH_GPU_TRAINING_PY3 false + # image_size_baseline: 21500 + # device_type: &DEVICE_TYPE gpu + # python_version: &DOCKER_PYTHON_VERSION py3 + # tag_python_version: &TAG_PYTHON_VERSION py311 + # cuda_version: &CUDA_VERSION cu124 + # os_version: &OS_VERSION ubuntu22.04 + # tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + # docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + # *DEVICE_TYPE ] + # target: sagemaker + # 
context: + # <<: *TRAINING_CONTEXT diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu index 4de4feae40cc..294f89f5f0e0 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu @@ -306,13 +306,15 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ shap \ # pinned for sagemaker==2.235.1 "cloudpickle==2.2.1" \ + && /opt/conda/bin/mamba clean -afy + +RUN pip install --no-cache-dir -U \ # pin numpy requirement for sagemaker dependency # requires explicit declaration of spacy, thic, blis spacy \ thinc \ - "blis<1" \ - "numpy<2" \ - && /opt/conda/bin/mamba clean -afy + blis \ + "numpy<2" # Copy workaround script for incorrect hostname COPY changehostname.c / From 4d69986b1302fb720f48e4ebf26bd3258cb0590c Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Tue, 10 Dec 2024 14:32:12 -0800 Subject: [PATCH 4/6] build test all --- pytorch/training/docker/2.4/py3/Dockerfile.cpu | 1 + pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu index 294f89f5f0e0..365b4fd57123 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu +++ b/pytorch/training/docker/2.4/py3/Dockerfile.cpu @@ -311,6 +311,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \ RUN pip install --no-cache-dir -U \ # pin numpy requirement for sagemaker dependency # requires explicit declaration of spacy, thic, blis + # pip install due to pip check conflict with conda package spacy \ thinc \ blis \ diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu index ef98e8a997bf..dcd1b273451a 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu @@ -488,13 +488,16 @@ RUN /opt/conda/bin/mamba install 
-y -c conda-forge \ seaborn \ # pinned for sagemaker==2.235.1 "cloudpickle==2.2.1" \ + && /opt/conda/bin/mamba clean -afy + +RUN pip install --no-cache-dir -U \ # pin numpy requirement for sagemaker dependency # requires explicit declaration of spacy, thic, blis + # pip install due to pip check conflict with conda package spacy \ thinc \ - "blis<1" \ - "numpy<2" \ - && /opt/conda/bin/mamba clean -afy + blis \ + "numpy<2" # Add SageMaker DataParallel to LD_LIBRARY_PATH ENV LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_SHORT_VERSION}/site-packages/smdistributed/dataparallel/lib:$LD_LIBRARY_PATH" From e4d59d762f4bcc24352d5538bef7c175407f5180 Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Tue, 10 Dec 2024 15:15:34 -0800 Subject: [PATCH 5/6] build all --- pytorch/training/buildspec-2-4-sm.yml | 34 +++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pytorch/training/buildspec-2-4-sm.yml b/pytorch/training/buildspec-2-4-sm.yml index a99fb53f3613..1cd563d6cbef 100644 --- a/pytorch/training/buildspec-2-4-sm.yml +++ b/pytorch/training/buildspec-2-4-sm.yml @@ -47,20 +47,20 @@ images: target: sagemaker context: <<: *TRAINING_CONTEXT - # BuildSageMakerGPUPTTrainPy3DockerImage: - # <<: *TRAINING_REPOSITORY - # build: &PYTORCH_GPU_TRAINING_PY3 false - # image_size_baseline: 21500 - # device_type: &DEVICE_TYPE gpu - # python_version: &DOCKER_PYTHON_VERSION py3 - # tag_python_version: &TAG_PYTHON_VERSION py311 - # cuda_version: &CUDA_VERSION cu124 - # os_version: &OS_VERSION ubuntu22.04 - # tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] - # # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" - # docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., - # 
*DEVICE_TYPE ] - # target: sagemaker - # context: - # <<: *TRAINING_CONTEXT + BuildSageMakerGPUPTTrainPy3DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 21500 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py311 + cuda_version: &CUDA_VERSION cu124 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ] + # build_tag_override: "beta:2.4.0-gpu-py311-cu124-ubuntu22.04-sagemaker" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., + *DEVICE_TYPE ] + target: sagemaker + context: + <<: *TRAINING_CONTEXT From 44b1eba946cbab92e62d4ed046f96b183833117b Mon Sep 17 00:00:00 2001 From: sirutBuasai Date: Wed, 11 Dec 2024 14:31:24 -0800 Subject: [PATCH 6/6] revert dockerfile and toml --- dlc_developer_config.toml | 112 ++++++++++-- .../training/docker/2.4/py3/Dockerfile.cpu | 86 ++++----- .../docker/2.4/py3/cu124/Dockerfile.gpu | 172 +++++++++--------- 3 files changed, 230 insertions(+), 140 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 212b285c205f..e54e9a26290b 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -1,77 +1,167 @@ [dev] +# Set to "huggingface", for example, if you are a huggingface developer. 
Default is "" partner_developer = "" +# Please only set it to true if you are preparing an EI related PR +# Do remember to revert it back to false before merging any PR (including EI dedicated PR) ei_mode = false +# Please only set it to true if you are preparing a NEURON related PR +# Do remember to revert it back to false before merging any PR (including NEURON dedicated PR) neuron_mode = false +# Please only set it to true if you are preparing a NEURONX related PR +# Do remember to revert it back to false before merging any PR (including NEURONX dedicated PR) neuronx_mode = false +# Please only set it to true if you are preparing a GRAVITON related PR +# Do remember to revert it back to false before merging any PR (including GRAVITON dedicated PR) graviton_mode = false +# Please only set it to true if you are preparing a ARM64 related PR +# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) arm64_mode = false +# Please only set it to True if you are preparing a HABANA related PR +# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false +# Please only set it to True if you are preparing a HUGGINGFACE TRCOMP related PR +# Do remember to revert it back to False before merging any PR (including HUGGINGFACE TRCOMP dedicated PR) +# This mode is used to build TF 2.6 and PT1.11 DLC huggingface_trcomp_mode = false +# Please only set it to True if you are preparing a TRCOMP related PR +# Do remember to revert it back to False before merging any PR (including TRCOMP dedicated PR) +# This mode is used to build PT1.12 and above DLC trcomp_mode = false +# Set deep_canary_mode to true to simulate Deep Canary Test conditions on PR for all frameworks in the +# build_frameworks list below. This will cause all image builds and non-deep-canary tests on the PR to be skipped, +# regardless of whether they are enabled or disabled below. 
+# Set graviton_mode/arm64_mode to true to run Deep Canaries on Graviton/ARM64 images. +# Do remember to revert it back to false before merging any PR. deep_canary_mode = false [build] -build_frameworks = [ "pytorch",] +# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. +# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] +build_frameworks = [] + +# By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true + +# Set do_build to "false" to skip builds and test the latest image built by this PR +# Note: at least one build is required to set do_build to "false" do_build = true [notify] +### Notify on test failures +### Off by default notify_test_failures = false -notification_severity = "medium" + # Valid values: medium or high + notification_severity = "medium" [test] +### On by default sanity_tests = true security_tests = true -safety_check_test = false -ecr_scan_allowlist_feature = false + safety_check_test = false + ecr_scan_allowlist_feature = false ecs_tests = true eks_tests = true ec2_tests = true -ec2_benchmark_tests = true -ec2_tests_on_heavy_instances = true +# Set it to true if you are preparing a Benchmark related PR +ec2_benchmark_tests = false + +### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by +### default. If false, these types of tests will be skipped while other tests will run as usual. +### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. 
+### Off by default (set to false) +ec2_tests_on_heavy_instances = false + +### SM specific tests +### On by default sagemaker_local_tests = true + +# run standard sagemaker remote tests from test/sagemaker_tests sagemaker_remote_tests = true -sagemaker_efa_tests = true -sagemaker_rc_tests = true -sagemaker_benchmark_tests = true +# run efa sagemaker tests +sagemaker_efa_tests = false +# run release_candidate_integration tests +sagemaker_rc_tests = false +# run sagemaker benchmark tests +sagemaker_benchmark_tests = false + +# SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" + +# Run CI tests for nightly images +# false by default nightly_pr_test_mode = false + use_scheduler = false [buildspec_override] +# Assign the path to the required buildspec file from the deep-learning-containers folder +# For example: +# dlc-pr-tensorflow-2-habana-training = "habana/tensorflow/training/buildspec-2-10.yml" +# dlc-pr-pytorch-inference = "pytorch/inference/buildspec-1-12.yml" +# Setting the buildspec file path to "" allows the image builder to choose the default buildspec file. 
+ +### TRAINING PR JOBS ### + +# Standard Framework Training dlc-pr-mxnet-training = "" -dlc-pr-pytorch-training = "pytorch/training/buildspec-2-4-sm.yml" +dlc-pr-pytorch-training = "" dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" + +# HuggingFace Training dlc-pr-huggingface-tensorflow-training = "" dlc-pr-huggingface-pytorch-training = "" + +# Training Compiler dlc-pr-huggingface-pytorch-trcomp-training = "" dlc-pr-huggingface-tensorflow-2-trcomp-training = "" dlc-pr-pytorch-trcomp-training = "" + +# Neuron Training dlc-pr-mxnet-neuron-training = "" dlc-pr-pytorch-neuron-training = "" dlc-pr-tensorflow-2-neuron-training = "" + +# Stability AI Training dlc-pr-stabilityai-pytorch-training = "" + +# Habana Training dlc-pr-pytorch-habana-training = "" dlc-pr-tensorflow-2-habana-training = "" + +### INFERENCE PR JOBS ### + +# Standard Framework Inference dlc-pr-mxnet-inference = "" dlc-pr-pytorch-inference = "" dlc-pr-tensorflow-2-inference = "" dlc-pr-autogluon-inference = "" + +# Neuron Inference dlc-pr-mxnet-neuron-inference = "" dlc-pr-pytorch-neuron-inference = "" dlc-pr-tensorflow-1-neuron-inference = "" dlc-pr-tensorflow-2-neuron-inference = "" + +# HuggingFace Inference dlc-pr-huggingface-tensorflow-inference = "" dlc-pr-huggingface-pytorch-inference = "" dlc-pr-huggingface-pytorch-neuron-inference = "" + +# Stability AI Inference dlc-pr-stabilityai-pytorch-inference = "" + +# Graviton Inference dlc-pr-mxnet-graviton-inference = "" dlc-pr-pytorch-graviton-inference = "" dlc-pr-tensorflow-2-graviton-inference = "" + +# ARM64 Inference dlc-pr-pytorch-arm64-inference = "" dlc-pr-tensorflow-2-arm64-inference = "" + +# EIA Inference dlc-pr-mxnet-eia-inference = "" dlc-pr-pytorch-eia-inference = "" dlc-pr-tensorflow-2-eia-inference = "" diff --git a/pytorch/training/docker/2.4/py3/Dockerfile.cpu b/pytorch/training/docker/2.4/py3/Dockerfile.cpu index 365b4fd57123..d94c486a0294 100644 --- a/pytorch/training/docker/2.4/py3/Dockerfile.cpu +++ 
b/pytorch/training/docker/2.4/py3/Dockerfile.cpu @@ -200,49 +200,49 @@ RUN rm -rf /root/.cache | true # |_| ######################################################## -# FROM common AS ec2 -# -# ARG PYTHON -# ARG TORCH_URL -# ARG TORCHVISION_URL -# ARG TORCHAUDIO_URL -# ARG TORCHTEXT_URL -# -# WORKDIR / -# -# # Install PyTorch -# RUN pip install --no-cache-dir -U \ -# ${TORCH_URL} \ -# ${TORCHVISION_URL} \ -# ${TORCHAUDIO_URL} \ -# ${TORCHTEXT_URL} \ -# torchtnt \ -# s3torchconnector \ -# fastai \ -# accelerate \ -# # pin numpy requirement for fastai dependency -# # requires explicit declaration of spacy, thic, blis -# spacy \ -# thinc \ -# blis \ -# "numpy<2" \ -# && pip uninstall -y dataclasses -# -# RUN HOME_DIR=/root \ -# && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ -# && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ -# && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ -# && chmod +x /usr/local/bin/testOSSCompliance \ -# && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ -# && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ -# && rm -rf ${HOME_DIR}/oss_compliance* \ -# && rm -rf /tmp/tmp* -# -# # Removing the cache as it is needed for security verification -# RUN rm -rf /root/.cache | true -# -# # Starts framework -# CMD ["/bin/bash"] +FROM common AS ec2 + +ARG PYTHON +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + torchtnt \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + thinc \ + blis \ + "numpy<2" \ + && pip uninstall -y dataclasses + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip 
https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +# Starts framework +CMD ["/bin/bash"] ################################################################# # ____ __ __ _ diff --git a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu index dcd1b273451a..c28ee66c64bd 100644 --- a/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu +++ b/pytorch/training/docker/2.4/py3/cu124/Dockerfile.gpu @@ -283,92 +283,92 @@ RUN rm -rf /root/.cache | true # |_| ######################################################## -# FROM common AS ec2 -# -# ARG PYTHON -# ARG NCCL_VERSION -# ARG GDRCOPY_VERSION -# ARG APEX_VERSION -# ARG TORCH_URL -# ARG TORCHVISION_URL -# ARG TORCHAUDIO_URL -# ARG TORCHTEXT_URL -# -# WORKDIR / -# -# # Install PyTorch -# RUN pip install --no-cache-dir -U \ -# ${TORCH_URL} \ -# ${TORCHVISION_URL} \ -# ${TORCHAUDIO_URL} \ -# ${TORCHTEXT_URL} \ -# torchtnt \ -# triton \ -# s3torchconnector \ -# fastai \ -# accelerate \ -# # pin numpy requirement for fastai dependency -# # requires explicit declaration of spacy, thic, blis -# spacy \ -# thinc \ -# blis \ -# "numpy<2" \ -# && pip uninstall dataclasses -# -# # Install GDRCopy which is a dependency of SM Distributed DataParallel binary -# # The test binaries requires cuda driver library which could be found in conda -# # So update the linker path to point to it to avoid -Lcuda not found -# RUN cd /tmp \ -# && git clone 
https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ -# && cd gdrcopy \ -# && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ -# && CUDA=${CUDA_HOME} make install \ -# && rm -rf /tmp/gdrcopy -# -# # Install NCCL -# RUN cd /tmp \ -# && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ -# && cd nccl \ -# && make -j64 src.build BUILDDIR=/usr/local \ -# && rm -rf /tmp/nccl -# -# # Install Nvidia Apex (needs pytorch) -# RUN cd /tmp \ -# && pip install --no-cache-dir packaging \ -# && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \ -# && cd apex \ -# && pip install -v \ -# --disable-pip-version-check \ -# --no-cache-dir \ -# --no-build-isolation \ -# --config-settings "--build-option=--cpp_ext" \ -# --config-settings "--build-option=--cuda_ext" ./ \ -# && rm -rf /tmp/apex -# -# # Install flash attn and NVIDIA transformer engine. -# # Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install -# ENV NVTE_FRAMEWORK=pytorch -# # Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features -# # Set MAX_JOBS=4 to avoid OOM issues in installation process -# RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation -# # Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html -# RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9 -# -# RUN HOME_DIR=/root \ -# && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ -# && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ -# && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ -# && chmod +x /usr/local/bin/testOSSCompliance \ -# && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ -# && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} 
${PYTHON} \ -# && rm -rf ${HOME_DIR}/oss_compliance* \ -# && rm -rf /tmp/tmp* -# -# # Removing the cache as it is needed for security verification -# RUN rm -rf /root/.cache | true -# -# # Starts framework -# CMD ["/bin/bash"] +FROM common AS ec2 + +ARG PYTHON +ARG NCCL_VERSION +ARG GDRCOPY_VERSION +ARG APEX_VERSION +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL + +WORKDIR / + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + torchtnt \ + triton \ + s3torchconnector \ + fastai \ + accelerate \ + # pin numpy requirement for fastai dependency + # requires explicit declaration of spacy, thinc, blis + spacy \ + thinc \ + blis \ + "numpy<2" \ + && pip uninstall -y dataclasses + +# Install GDRCopy which is a dependency of SM Distributed DataParallel binary +# The test binaries requires cuda driver library which could be found in conda +# So update the linker path to point to it to avoid -Lcuda not found +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ + && cd gdrcopy \ + && sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ + && CUDA=${CUDA_HOME} make install \ + && rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl + +# Install Nvidia Apex (needs pytorch) +RUN cd /tmp \ + && pip install --no-cache-dir packaging \ + && git clone https://github.com/NVIDIA/apex -b ${APEX_VERSION} \ + && cd apex \ + && pip install -v \ + --disable-pip-version-check \ + --no-cache-dir \ + --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" ./ \ + && rm -rf /tmp/apex + +# Install flash attn and NVIDIA transformer engine. 
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==2.4.2 --no-build-isolation +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v1.9 + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +# Removing the cache as it is needed for security verification +RUN rm -rf /root/.cache | true + +# Starts framework +CMD ["/bin/bash"] ################################################################# # ____ __ __ _