# Torch Nightly WHL Tests #143 (page header captured together with the workflow file)
name: Torch Nightly WHL Tests

# Triggers: two cron schedules plus a manual dispatch with tunable inputs.
# The E2E job's step conditions compare `github.event.schedule` against these
# exact cron strings, so keep both places in sync when editing a schedule.
on:
  schedule:
    # 14:00 UTC (GMT+8 22:00), Sunday through Thursday
    - cron: '0 14 * * 0-4'
    # 17:00 UTC Friday (GMT+8 01:00 Saturday)
    - cron: '0 17 * * 5'
  workflow_dispatch:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'nightly'
        description: Pytorch branch/commit
      ut:
        required: false
        type: string
        default: 'torch_xpu'
        description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu`. Delimiter is comma
      suite:
        required: true
        type: string
        default: 'huggingface'
        description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench`. Delimiter is comma
      dt:
        required: true
        type: string
        default: 'float32'
        description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma
      mode:
        required: true
        type: string
        default: 'inference'
        description: Test mode. `inference,training`. Delimiter is comma
      scenario:
        required: true
        type: string
        default: 'accuracy'
        description: Test scenario. `accuracy,performance`. Delimiter is comma
      model:
        required: false
        type: string
        default: ''
        description: Model. Will only run this one model if set
      python:
        required: false
        type: string
        # Quoted on purpose: an unquoted 3.10 would be read as the float 3.1.
        default: '3.10'
        description: Python version
# Every job in this workflow gets a read-only token.
permissions: read-all

# One group per unique (workflow, commit, trigger, dispatch inputs) tuple.
# A newer run that lands in the same group cancels the one still in progress.
# Scheduled runs carry no inputs, so those segments expand to empty strings.
concurrency:
  group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
  cancel-in-progress: true
jobs:
  # Unit tests, delegated to the reusable _linux_ut.yml workflow.
  Linux-Nightly-Ondemand-UT-WHL-Tests:
    uses: ./.github/workflows/_linux_ut.yml
    with:
      # Scheduled runs have no dispatch inputs, so they fall back to the
      # full UT scope and Python 3.10.
      ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      pytorch: nightly_wheel
      runner: e2e_internal

  # Inductor end-to-end benchmarks. Which suites run is selected per step from
  # the trigger: nightly cron, weekly cron, or manual dispatch.
  Linux-Nightly-Ondemand-E2E-WHL-Tests:
    runs-on: e2e_internal
    # Don't run on forked repos
    if: github.repository_owner == 'intel'
    # NOTE(review): 36000 minutes is 25 days. GitHub's documented per-job
    # execution limit may be lower than this; confirm the value is intended.
    timeout-minutes: 36000
    env:
      pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }}
      ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
    outputs:
      TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }}
      TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }}
      TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }}
      TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
      TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
      TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
      TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
      TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
      TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
      DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
      KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
      BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
      OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
      GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
      # NOTE(review): the `summary` step in this job never writes
      # TIMEOUT_MODELS to GITHUB_OUTPUT, so this resolves to an empty string.
      TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4

      # Despite the step name this builds a fresh Python venv, not a conda env.
      # The interpreter is pinned to /usr/bin/python3.10; the job-level
      # `python` env value is not consulted here.
      - name: Prepare Conda ENV
        run: |
          rm -rf ${HOME}/mengfeil/myvenv-e2e
          /usr/bin/python3.10 -m venv ${HOME}/mengfeil/myvenv-e2e
          source ${HOME}/mengfeil/myvenv-e2e/bin/activate
          # pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          pip install pandas scipy tqdm

      # Installs the nightly XPU wheels, records the installed torch version
      # and commit, then clones pytorch at that commit next to the workspace
      # and checks out torch-xpu-ops at the commit listed in third_party/xpu.txt.
      - name: Prepare Stock Pytorch
        id: installed
        run: |
          pwd
          source ${HOME}/mengfeil/myvenv-e2e/bin/activate
          pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
          echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
          echo "TORCH_COMMIT_ID=${TORCH_COMMIT_ID}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          cd ../ && rm -rf pytorch
          git clone https://github.com/pytorch/pytorch pytorch
          cd pytorch && git checkout ${TORCH_COMMIT_ID}
          # apply PRs for stock pytorch
          pip install requests
          # python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
          git status && git show -s
          pip install -r requirements.txt
          TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)
          echo "TORCH_XPU_OPS_COMMIT=${TORCH_XPU_OPS_COMMIT}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          rm -rf third_party/torch-xpu-ops
          git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
          cd third_party/torch-xpu-ops
          git checkout ${TORCH_XPU_OPS_COMMIT}

      # Records library commits/versions and host details. Every value is
      # written both to the step outputs and to the job environment.
      - name: Identify pinned versions
        id: pinned
        env:
          # The free-form dispatch input reaches the script through the
          # environment rather than being expanded into the shell source,
          # so its content can never be executed as shell code.
          MODEL_INPUT: ${{ inputs.model }}
        run: |
          source ${HOME}/mengfeil/myvenv-e2e/bin/activate
          echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          cd ../pytorch
          echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "MODEL_ONLY_NAME=${MODEL_INPUT}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "BUNDLE_VERSION=$(pip list |grep cmplr |head -n 1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          . /etc/os-release
          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo ${GITHUB_ENV}

      # Prints the path of the env file, then removes logs and inductor
      # caches left over from an earlier run on this runner.
      - name: Show GITHUB_ENV
        run: |
          echo "$GITHUB_ENV"
          rm -rf ../pytorch/inductor_log
          rm -rf /tmp/torchinductor_*

      # Nightly launch
      - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Nightly Torchbench BF16 Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: torchbench
          env_prepare: true
          dt: bfloat16
          mode: training
          scenario: accuracy
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Nightly Timm_models FP16 Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: timm_models
          env_prepare: true
          dt: float16
          mode: training
          scenario: accuracy
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      # Weekly launch
      - name: Weekly Huggingface Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Weekly Torchbench Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: torchbench
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Weekly Timm_models Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: timm_models
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      # On-demand launch
      - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
        if: github.event_name != 'schedule'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: ${{ inputs.suite }}
          env_prepare: true
          dt: ${{ inputs.dt }}
          mode: ${{ inputs.mode }}
          scenario: ${{ inputs.scenario }}
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      # Runs unless the workflow was cancelled, so logs of failed tests are
      # still collected. Copies the logs into the workspace for upload and
      # keeps a tarball of them in a _backup directory outside the workspace.
      - name: Summarize archieve files
        id: summary
        if: ${{ ! cancelled() }}
        run: |
          rm -rf ${{ github.workspace }}/upload_files
          cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
          mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/
          find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days
          tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs
      - name: Upload Inductor XPU E2E Data
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          # No pull_request trigger is declared in the visible `on:` section,
          # so the name falls back to the commit SHA.
          name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
          path: ${{ github.workspace }}/upload_files