Skip to content

Commit

Permalink
[CI] Extract torch build as a standalone job (#1271)
Browse files Browse the repository at this point in the history
Extract the PyTorch build into a standalone job for CI/CD.
Add a fallback mechanism that rebuilds PyTorch from the last known good commit
tracked in #1280, and post a comment on that issue whenever a new build fails.

1. Created issue #1280 to track the latest workable PyTorch main commit.
2. The CI test always pulls the latest PyTorch main and builds it together with
the torch-xpu-ops PR branch; on success go to step 3, on failure go to step 4.
3. Update the commit recorded in step 1, then continue with the CI tests.
4. Add a comment on issue #1280 naming the failing PyTorch commit, read the
commit recorded in step 1, rebuild PyTorch from it, then continue with the CI tests.
  • Loading branch information
chuanqi129 authored Jan 14, 2025
1 parent 69ff73f commit 299831d
Show file tree
Hide file tree
Showing 5 changed files with 356 additions and 57 deletions.
181 changes: 181 additions & 0 deletions .github/workflows/_linux_build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
name: Linux PyTorch XPU Build

# Reusable workflow: builds a PyTorch XPU wheel on the requested runner and
# reports the PyTorch commit that was built back to the caller.
on:
  workflow_call:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'main'
        description: Pytorch branch/commit
      keep_torch_xpu_ops:
        required: false
        type: string
        default: 'false'
        description: Keep torch-xpu-ops pin. `true` means use pined commit
      abi:
        required: false
        type: string
        # Quoted so the default matches the declared `string` type.
        default: '1'
        description: ABI version. Default abi as 1.
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Runner label
      driver:
        required: false
        type: string
        default: 'lts'
        description: Driver lts/rolling
    outputs:
      # Workflow-level outputs must reference the job by its id under `jobs:`,
      # which is `build` in this file.
      whl_name:
        description: The name of the wheel file
        # NOTE(review): the `build` job declares only `TORCH_COMMIT_ID` in its
        # `outputs`, so this value is empty until the job exports `whl_name`.
        value: ${{ jobs.build.outputs.whl_name }}
      torch_commit_id:
        description: The commit id of the torch build
        value: ${{ jobs.build.outputs.TORCH_COMMIT_ID }}

# The build job edits and comments on the tracking issue through the `gh` CLI.
permissions:
  issues: write

jobs:
  # Builds a PyTorch XPU wheel from source, falls back to the last known good
  # PyTorch commit when the requested one fails to build, and uploads the wheel
  # and the build log as artifacts.
  build:
    # Skip the source build when the caller asks for the prebuilt nightly wheel.
    # The whole comparison has to sit inside `${{ }}`: written as
    # `${{ inputs.pytorch }} != 'nightly_wheel'` it renders to a non-empty
    # string, which is always truthy.
    if: ${{ inputs.pytorch != 'nightly_wheel' }}
    runs-on: ${{ inputs.runner }}
    outputs:
      TORCH_COMMIT_ID: ${{ steps.build_version.outputs.TORCH_COMMIT_ID }}
    timeout-minutes: 900
    env:
      # Issue whose body records the last PyTorch commit that built successfully.
      commit_issue: 1280
      GH_TOKEN: ${{ github.token }}
      NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
      DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Stock Pytorch
        run: |
          pwd
          which conda && conda clean -ay
          # Recreate the build environment from scratch on every run.
          conda remove --all -y -n xpu_build || \
            rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_build
          conda create -n xpu_build python=${{ inputs.python }} cmake ninja -y
          source activate xpu_build
          cd ../ && rm -rf pytorch
          pip install requests
          git clone https://github.com/pytorch/pytorch pytorch
          # The in-script nightly_wheel guards are kept as a defensive check
          # next to the job-level condition.
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            cd pytorch && git checkout $(echo ${{ inputs.pytorch }})
            # apply PRs for stock pytorch
            python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
            git status && git show -s
            git submodule sync && git submodule update --init --recursive
            if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
              echo "Don't replace torch-xpu-ops!"
            else
              rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
              # Workaround for torch-xpu-ops ci test
              sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
            fi
          fi
      - name: Build Pytorch XPU
        run: |
          source activate xpu_build
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          if [[ ${{ inputs.abi }} == '0' ]]; then
            export _GLIBCXX_USE_CXX11_ABI=0
          else
            export _GLIBCXX_USE_CXX11_ABI=1
          fi
          if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
            build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            repo="${{ github.repository }}"
            # Last known good commit: the bracketed text on the issue-body line
            # that mentions the requested branch.
            last_commit=$(gh --repo $repo issue view $commit_issue --json body -q .body | grep "${{ inputs.pytorch }}" | cut -d'[' -f 2 | cut -d']' -f 1)
            cd ../pytorch
            current_commit=$(git rev-parse HEAD)
            echo ">>>>>>>>>>>>branch: ${{ inputs.pytorch }}, last commit: ${last_commit}, current commit: ${current_commit}"
            export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
            pip install -r requirements.txt
            # Success is judged by whether a wheel was produced, not by the
            # status of the `... | tee` pipeline.
            WERROR=1 python setup.py bdist_wheel 2>&1 | tee pytorch_${current_commit}_build.log
            # `compgen -G` succeeds when the glob matches at least one file;
            # `[ -f glob ]` errors out if the glob expands to several paths.
            if compgen -G "dist/torch*.whl" > /dev/null; then
              echo "Wheel build successful, update last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
              if [ -n "${last_commit}" ]; then
                gh --repo $repo issue view $commit_issue --json body -q .body | sed "s;${last_commit};${current_commit};" > new_body.txt
                gh --repo $repo issue edit $commit_issue --body-file new_body.txt
              else
                # An empty pattern would make sed fail and abort an otherwise good build.
                echo "WARNING: no tracked commit found for '${{ inputs.pytorch }}' in issue ${commit_issue}, skip the issue update"
              fi
            else
              echo "Wheel build failed, use last commit in the issue https://github.com/intel/torch-xpu-ops/issues/1280"
              gh --repo $repo issue comment $commit_issue -b "Wheel build failed with commit [${current_commit}](https://github.com/pytorch/pytorch/tree/${current_commit}), refer ${build_url}. CC @intel/torch-xpu-ops-maintain @EikanWang @riverliuintel @fengyuan14 @xytintel @etaf @chuanqi129 @mengfei25"
              if [ -z "${last_commit}" ]; then
                # Without a fallback commit `git checkout` would be a no-op and
                # the same failing commit would be rebuilt.
                echo "ERROR: no tracked commit found for '${{ inputs.pytorch }}' in issue ${commit_issue}, cannot fall back"
                exit 1
              fi
              git clean -df .
              git checkout $last_commit
              # apply PRs for stock pytorch
              python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
              git status && git show -s
              git submodule sync && git submodule update --init --recursive
              if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then
                echo "Don't replace torch-xpu-ops!"
              else
                rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
                # Workaround for torch-xpu-ops ci test
                sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
              fi
              WERROR=1 python setup.py bdist_wheel
            fi
            pip install --force-reinstall dist/*.whl
            # Stage the wheel and the log where the upload steps look for them.
            cp dist/*.whl ${{ github.workspace }}/
            cp pytorch_${current_commit}_build.log ${{ github.workspace }}/
          else
            pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
            TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
            cd ../pytorch
            git reset --hard && git checkout ${TORCH_COMMIT_ID}
            TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
            rm -rf third_party/torch-xpu-ops
            git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
            cd third_party/torch-xpu-ops
            git checkout ${TORCH_XPU_OPS_COMMIT}
            cd ../..
          fi
      - name: Torch Config
        run: |
          source activate xpu_build
          source .github/scripts/env.sh ${{ inputs.pytorch }}
          python -c "import torch; print(torch.__config__.show())"
          python -c "import torch; print(torch.__config__.parallel_info())"
          python -c "import torch; print(torch.xpu.device_count())"
          cd ..
          python pytorch/torch/utils/collect_env.py
      - name: Identify Build version
        id: build_version
        run: |
          source .github/scripts/env.sh
          cd ../pytorch
          # Each value is written both to the step outputs and to the job environment.
          echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          . /etc/os-release
          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo ${GITHUB_ENV}
      - name: Upload Torch XPU Wheel
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}/torch*.whl
      - name: Upload Build Log
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: Torch-XPU-Build-Log-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
          path: ${{ github.workspace }}/pytorch_*.log
13 changes: 9 additions & 4 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ on:
permissions: read-all

jobs:
Torch-XPU-UT-Tests:
ut_test:
runs-on: ${{ inputs.runner }}
timeout-minutes: 900
env:
Expand Down Expand Up @@ -95,7 +95,13 @@ jobs:
if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
fi
- name: Build Pytorch XPU
- name: Download Pytorch wheel
  # Fetch the wheel artifact unless the prebuilt nightly wheel was requested.
  # The comparison must sit inside `${{ }}`: written as
  # `${{ inputs.pytorch }} != 'nightly_wheel'` it renders to a non-empty
  # string, which is always truthy.
  if: ${{ inputs.pytorch != 'nightly_wheel' }}
  uses: actions/download-artifact@v4
  with:
    # Has to match the artifact name used by the wheel upload step in _linux_build.yml.
    name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
    path: ${{ github.workspace }}
- name: Install Pytorch XPU
run: |
source activate xpu_op_${ZE_AFFINITY_MASK}
source .github/scripts/env.sh ${{ inputs.pytorch }}
Expand All @@ -109,8 +115,7 @@ jobs:
cd ../pytorch
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
pip install -r requirements.txt
WERROR=1 python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd ..
else
pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
Expand Down
70 changes: 53 additions & 17 deletions .github/workflows/nightly_ondemand.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,36 +66,41 @@ concurrency:
cancel-in-progress: ${{ github.event_name != 'schedule' }}

jobs:
Linux-Nightly-Ondemand-UT-Tests:
if: github.event_name == 'schedule' || ${{ inputs.ut_suite }}
uses: ./.github/workflows/_linux_ut.yml
Linux-Nightly-Ondemand-Build:
if: always()
name: linux-nightly-ondemand
permissions:
issues: write
uses: ./.github/workflows/_linux_build.yml
with:
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut' || inputs.ut }}
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
abi: 1
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
runner: linux.idc.xpu
runner: pvc_e2e

Linux-Weekly-UT-Tests-ABI-0:
if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
Linux-Nightly-Ondemand-UT-Tests:
if: github.event_name == 'schedule' || ${{ inputs.ut_suite }}
name: linux-nightly-ondemand
needs: Linux-Nightly-Ondemand-Build
uses: ./.github/workflows/_linux_ut.yml
with:
abi: 0
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
ut: op_regression,op_regression_dev1,op_extended,op_ut
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut' || inputs.ut }}
pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
runner: linux.idc.xpu

Linux-Nightly-Ondemand-E2E-Tests:
runs-on: pvc_e2e
name: linux-nightly-ondemand / e2e_test
# Don't run on forked repos
if: github.repository_owner == 'intel'
needs: Linux-Nightly-Ondemand-Build
timeout-minutes: 3600
env:
pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
pytorch: ${{ needs.Linux-Nightly-Ondemand-Build.outputs.torch_commit_id }}
keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
Expand Down Expand Up @@ -176,15 +181,19 @@ jobs:
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
- name: Build Pytorch XPU
- name: Download Pytorch wheel
  # Fetch the wheel artifact unless the prebuilt nightly wheel was requested.
  # The comparison must sit inside `${{ }}`: written as
  # `${{ inputs.pytorch }} != 'nightly_wheel'` it renders to a non-empty
  # string, which is always truthy.
  if: ${{ inputs.pytorch != 'nightly_wheel' }}
  uses: actions/download-artifact@v4
  with:
    # NOTE(review): the build workflow names its artifact with its own `abi`
    # input; confirm this workflow defines `inputs.abi` with the same value,
    # otherwise the artifact name will not match.
    name: Torch-XPU-Wheel-${{ github.event.pull_request.number || github.sha }}-${{ inputs.abi }}
    path: ${{ github.workspace }}
- name: Install Pytorch XPU
run: |
source activate e2e_ci
source .github/scripts/env.sh
cd ../pytorch
pip install -r requirements.txt
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
pip install --force-reinstall ${{ github.workspace }}/torch*.whl
- name: Show GITHUB_ENV
run: |
echo "$GITHUB_ENV"
Expand Down Expand Up @@ -294,6 +303,33 @@ jobs:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files

# ABI=0 source build; runs only for the `0 16 * * 5` schedule (Fridays, 16:00 UTC).
Linux-Nightly-Ondemand-Build-ABI-0:
  name: linux-nightly-ondemand-abi0
  if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
  uses: ./.github/workflows/_linux_build.yml
  # The called build workflow edits and comments on an issue through `gh`.
  permissions:
    issues: write
  with:
    abi: 0
    runner: pvc_e2e
    python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
    pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }}
    keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}

# ABI=0 unit tests; run after the ABI=0 build on the `0 16 * * 5` schedule
# and test the PyTorch commit that build reports.
Linux-Weekly-UT-Tests-ABI-0:
  name: linux-nightly-ondemand-abi0
  needs: Linux-Nightly-Ondemand-Build-ABI-0
  if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5'
  uses: ./.github/workflows/_linux_ut.yml
  with:
    abi: 0
    runner: linux.idc.xpu
    ut: op_regression,op_regression_dev1,op_extended,op_ut
    # Commit id exported by the build job rather than a branch name.
    pytorch: ${{ needs.Linux-Nightly-Ondemand-Build-ABI-0.outputs.torch_commit_id }}
    python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
    triton: ${{ github.event_name == 'schedule' && '' || inputs.triton }}
    keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }}

Tests-Failure-And-Report:
if: ${{ ! cancelled() }}
runs-on: [ self-hosted, Linux ]
Expand Down
Loading

0 comments on commit 299831d

Please sign in to comment.