From fbf01758b918379ea874058f5ec29e94a0e53326 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 28 Jan 2025 11:34:41 +0000 Subject: [PATCH] 2025-01-28 nightly release (52b07493bbb981e94ad45516d917792ff0cb86fd) --- .github/scripts/validate_binaries.sh | 66 ++++++++++++++++++---------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 85ad0de47..f9958a208 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -7,8 +7,9 @@ export PYTORCH_CUDA_PKG="" +export CONDA_ENV="build_binary" -conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}" +conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}" conda run -n build_binary python --version @@ -49,41 +50,58 @@ elif [[ ${MATRIX_CHANNEL} = 'release' ]]; then export PYTORCH_URL="https://download.pytorch.org/whl/${CUDA_VERSION}" fi + +echo "CU_VERSION: ${CUDA_VERSION}" +echo "MATRIX_CHANNEL: ${MATRIX_CHANNEL}" +echo "CONDA_ENV: ${CONDA_ENV}" + +# shellcheck disable=SC2155 +export CONDA_PREFIX=$(conda run -n "${CONDA_ENV}" printenv CONDA_PREFIX) + + +# Set LD_LIBRARY_PATH to fix the runtime error with fbgemm_gpu not +# being able to locate libnvrtc.so +# NOTE: The order of the entries in LD_LIBRARY_PATH matters +echo "[NOVA] Setting LD_LIBRARY_PATH ..." +conda env config vars set -n ${CONDA_ENV} \ + LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:/usr/local/lib:/usr/lib64:${LD_LIBRARY_PATH}" + + # install pytorch # switch back to conda once torch nightly is fixed # if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then # export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}" # fi -conda run -n build_binary pip install torch --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL" # install fbgemm -conda run -n build_binary pip install fbgemm-gpu --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL" # install requirements from pypi -conda run -n build_binary pip install torchmetrics==1.0.3 +conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3 # install torchrec -conda run -n build_binary pip install torchrec --index-url "$PYTORCH_URL" +conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL" # Run small import test -conda run -n build_binary python -c "import torch; import fbgemm_gpu; import torchrec" +conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec" # check directory ls -R # check if cuda available -conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())" # check cuda version -conda run -n build_binary python -c "import torch; print(torch.version.cuda)" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)" # Finally run smoke test # python 3.11 needs torchx-nightly -conda run -n build_binary pip install torchx-nightly iopath +conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath if [[ ${MATRIX_GPU_ARCH_TYPE} = 'cuda' ]]; then - conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py + conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py else - conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only + conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --script test_installation.py -- --cpu_only fi @@ -93,8 +111,8 @@ if [[ ${MATRIX_CHANNEL} != 'release' ]]; then exit 0 else # Check version matches only for release binaries - torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2) - fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2) + torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2) + fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2) if [ "$torchrec_version" != "$fbgemm_version" ]; then echo "Error: TorchRec package version does not match FBGEMM package version" @@ -102,22 +120,22 @@ else fi fi -conda create -y -n build_binary python="${MATRIX_PYTHON_VERSION}" +conda create -y -n "${CONDA_ENV}" python="${MATRIX_PYTHON_VERSION}" -conda run -n build_binary python --version +conda run -n "${CONDA_ENV}" python --version if [[ ${MATRIX_GPU_ARCH_VERSION} != '12.4' ]]; then exit 0 fi echo "checking pypi release" -conda run -n build_binary pip install torch -conda run -n build_binary pip install fbgemm-gpu -conda run -n build_binary pip install torchrec +conda run -n "${CONDA_ENV}" pip install torch +conda run -n "${CONDA_ENV}" pip install fbgemm-gpu +conda run -n "${CONDA_ENV}" pip install torchrec # Check version matching again for PyPI -torchrec_version=$(conda run -n build_binary pip show torchrec | grep Version | cut -d' ' -f2) -fbgemm_version=$(conda run -n build_binary pip show fbgemm_gpu | grep Version | cut -d' ' -f2) +torchrec_version=$(conda run -n "${CONDA_ENV}" pip show torchrec | grep Version | cut -d' ' -f2) +fbgemm_version=$(conda run -n "${CONDA_ENV}" pip show fbgemm_gpu | grep Version | cut -d' ' -f2) if [ "$torchrec_version" != "$fbgemm_version" ]; then echo "Error: TorchRec package version does not match FBGEMM package version" @@ -128,13 +146,13 @@ fi ls -R # check if cuda available -conda run -n build_binary python -c "import torch; print(torch.cuda.is_available())" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.cuda.is_available())" # check cuda version -conda run -n build_binary python -c "import torch; print(torch.version.cuda)" +conda run -n "${CONDA_ENV}" python -c "import torch; print(torch.version.cuda)" # python 3.11 needs torchx-nightly -conda run -n build_binary pip install torchx-nightly iopath +conda run -n "${CONDA_ENV}" pip install torchx-nightly iopath # Finally run smoke test -conda run -n build_binary torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py +conda run -n "${CONDA_ENV}" torchx run -s local_cwd dist.ddp -j 1 --gpu 2 --script test_installation.py