name: Torch Nightly WHL Tests
on:
  schedule:
    # GMT+8 22:00, Sunday through Thursday (14:00 UTC)
    - cron: '0 14 * * 0-4'
    # GMT+8 1:00 Saturday (17:00 UTC Friday)
    - cron: '0 17 * * 5'
  workflow_dispatch:
    inputs:
      pytorch:
        required: false
        type: string
        default: 'nightly'
        description: PyTorch branch/commit
      ut:
        required: false
        type: string
        default: 'torch_xpu'
        description: UT scope. Comma-separated list of `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu`
      suite:
        required: true
        type: string
        default: 'huggingface'
        description: Dynamo benchmarks test suite. Comma-separated list of `huggingface,timm_models,torchbench`
      dt:
        required: true
        type: string
        default: 'float32'
        description: Data precision of the test. Comma-separated list of `float32,bfloat16,float16,amp_bf16,amp_fp16`
      mode:
        required: true
        type: string
        default: 'inference'
        description: Test mode. Comma-separated list of `inference,training`
      scenario:
        required: true
        type: string
        default: 'accuracy'
        description: Test scenario. Comma-separated list of `accuracy,performance`
      model:
        required: false
        type: string
        default: ''
        description: Model name. If set, only this model is run
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
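# On `schedule` runs the `inputs` context above is empty, so the
# `github.event_name == 'schedule' && '<default>' || inputs.<name>` expressions used
# in the jobs below fall back to the hard-coded nightly defaults; `workflow_dispatch`
# runs take the values entered in the dispatch form.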
permissions: read-all
concurrency:
  group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }}
  cancel-in-progress: true
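# The concurrency group above keys on the commit, the trigger type, and every dispatch
# input, so only an identical re-submission collides with an existing run; when it does,
# `cancel-in-progress: true` cancels the older run.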
jobs:
  Linux-Nightly-Ondemand-UT-WHL-Tests:
    if: ${{ github.event_name == 'schedule' || inputs.ut != '' }}
    uses: ./.github/workflows/_linux_ut.yml
    with:
      ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
      pytorch: nightly_wheel
      runner: e2e_internal
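  # The UT job above delegates to the reusable `_linux_ut.yml` workflow in this
  # repository; the `with:` values are passed as that workflow's `workflow_call` inputs.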
  Linux-Nightly-Ondemand-E2E-WHL-Tests:
    runs-on: e2e_internal
    # Don't run on forked repos
    if: github.repository_owner == 'intel'
    timeout-minutes: 3600
    env:
      pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }}
      ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
    outputs:
      TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }}
      TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }}
      TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }}
      TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }}
      TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }}
      TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }}
      TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }}
      TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }}
      TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }}
      DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
      KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }}
      BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
      OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
      GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
      TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }}
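    # Every value above is produced inside a step that has an `id` (installed / pinned /
    # summary), written to $GITHUB_OUTPUT there, and re-exported as a job output here so
    # the Tests-Failure-And-Report job can read it via
    # needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.<NAME>.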
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Prepare Python Environment
        run: |
          rm -rf myvenv-e2e
          /usr/bin/python3.10 -m venv myvenv-e2e
          source myvenv-e2e/bin/activate
          pip install mkl-static==2025.0.1 mkl-include==2025.0.1
          pip install pandas scipy tqdm
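      # The virtual environment is rebuilt from scratch on every run, so the nightly
      # wheels installed in the next step always land in a clean interpreter.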
      - name: Prepare Stock Pytorch
        id: installed
        run: |
          pwd
          source myvenv-e2e/bin/activate
          pip install --pre torch==2.6.0.dev20241202+xpu torchvision==0.20.0.dev20241202+xpu torchaudio==2.5.0.dev20241202+xpu --index-url https://download.pytorch.org/whl/nightly/xpu
          echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          source .github/scripts/env.sh
          cd ../ && rm -rf pytorch
          git clone https://github.com/pytorch/pytorch pytorch
          cd pytorch && git checkout ${TORCH_COMMIT_ID}
          # apply PRs for stock pytorch
          pip install requests
          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
          git status && git show -s
          pip install -r requirements.txt
          echo "TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          rm -rf third_party/torch-xpu-ops
          git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops
          cd third_party/torch-xpu-ops
          git checkout ${TORCH_XPU_OPS_COMMIT}
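      # The `tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"` idiom used above writes each
      # KEY=value line to two files at once: GITHUB_OUTPUT exposes it as a step output
      # (steps.installed.outputs.KEY) and GITHUB_ENV exports it as an environment
      # variable for the remaining steps of this job.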
      - name: Identify pinned versions
        id: pinned
        run: |
          source myvenv-e2e/bin/activate
          source .github/scripts/env.sh
          echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          cd ../pytorch
          echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          . /etc/os-release
          echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
          echo ${GITHUB_ENV}
      - name: Show GITHUB_ENV
        run: |
          echo "$GITHUB_ENV"
          rm -rf ../pytorch/inductor_log
          rm -rf /tmp/torchinductor_*
      # Nightly launch
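      # On scheduled runs, github.event.schedule contains the cron expression that
      # fired, which is how the nightly ('0 14 * * 0-4') and weekly ('0 17 * * 5')
      # step groups below are selected.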
      - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Nightly Torchbench BF16 Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: torchbench
          dt: bfloat16
          mode: training
          scenario: accuracy
          pytorch: nightly_wheel
          env_prepare: true
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Nightly Timm_models FP16 Training Accuracy Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: timm_models
          dt: float16
          mode: training
          scenario: accuracy
          pytorch: nightly_wheel
          env_prepare: true
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      # Weekly launch
      - name: Weekly Huggingface Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Weekly Torchbench Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: torchbench
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Weekly Timm_models Full Test
        if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: timm_models
          env_prepare: true
          dt: float32,bfloat16,float16,amp_bf16,amp_fp16
          mode: inference,training
          scenario: accuracy,performance
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      # On-demand launch
      - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
        if: github.event_name != 'schedule'
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: ${{ inputs.suite }}
          env_prepare: true
          dt: ${{ inputs.dt }}
          mode: ${{ inputs.mode }}
          scenario: ${{ inputs.scenario }}
          pytorch: nightly_wheel
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
      - name: Summarize archive files
        id: summary
        if: ${{ ! cancelled() }}
        run: |
          rm -rf ${{ github.workspace }}/upload_files
          cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
          mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/
          find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days
          tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs
          failed_models=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
          timeout_models=$(grep "timeout models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
          if [ ${timeout_models} -ne 0 ];then
            TIMEOUT_MODELS="$(
              grep -B 1 "timeout models: [1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log
            )"
            echo "TIMEOUT_MODELS=\"${TIMEOUT_MODELS}\"" |awk '{printf("%s\\n", $0)}' |sed 's/\\n$//' |tee -a "${GITHUB_OUTPUT}"
          fi
          if [ ${failed_models} -ne 0 ];then
            grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log |grep "failed" -B 1
            exit 1
          fi
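      # GITHUB_OUTPUT accepts single-line KEY=value entries (multi-line values need the
      # heredoc delimiter syntax), so the awk/sed pipeline above flattens the timeout-model
      # list into one line with literal "\n" separators; the step exits non-zero only when
      # a suite reports real failed models.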
      - name: Upload Inductor XPU E2E Data
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
          path: ${{ github.workspace }}/upload_files
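  # The reporting job below runs even if the E2E job failed (it is skipped only when the
  # workflow is cancelled) and reads that job's result and outputs through the `needs`
  # context; the GH_TOKEN env var (set to the workflow token) is what authenticates the
  # `gh` CLI call at the end of the step.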
  Tests-Failure-And-Report:
    if: ${{ ! cancelled() }}
    runs-on: [ self-hosted, Linux ]
    permissions:
      issues: write
    env:
      GH_TOKEN: ${{ github.token }}
      python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }}
    needs: Linux-Nightly-Ondemand-E2E-WHL-Tests
    steps:
      - name: Report github issue for XPU OPS nightly
        if: github.repository_owner == 'intel'
        run: |
          set -xe
          # Test env
          build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          repo="${{ github.repository }}"
          TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}"
          TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}"
          TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}"
          DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}"
          KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}"
          BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}"
          OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}"
          GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}"
          TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHBENCH_COMMIT_ID }}"
          TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHVISION_COMMIT_ID }}"
          TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHAUDIO_COMMIT_ID }}"
          TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRANSFORMERS_VERSION }}"
          TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMM_COMMIT_ID }}"
          TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRITON_COMMIT_ID }}"
          TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMEOUT_MODELS }}"
          # Test status
          if [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "success" ];then
            test_status=Success
          elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "failure" ];then
            test_status=Failure
            cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}"
          else
            test_status=None
            exit 0
          fi
          # Test Type
          if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then
            test_type="On-demand"
            test_issue_id=426
            cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}"
          elif [ "${{ github.event.schedule }}" == "0 17 * * 5" ];then
            test_type="Weekly"
            test_issue_id=432
          else
            test_type="Nightly"
            test_issue_id=432
          fi
          # Test report
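          # The printf/echo calls below build GitHub-flavored Markdown tables
          # (the "--- | ---" rows are header separators). The assembled report.txt
          # is posted as a comment on the tracking issue at the end of this step.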
echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt | |
printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt | |
echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt | |
printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt | |
printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt | |
printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt | |
echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt | |
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then | |
test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" | |
if [ "${{ inputs.model }}" != "" ];then | |
test_scope+="; model=${{ inputs.model }}" | |
fi | |
echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt | |
fi | |
echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt | |
echo "$cc_comment" >> ${{ github.workspace }}/report.txt | |
# Report | |
report_txt=$(cat ${{ github.workspace }}/report.txt) | |
gh --repo $repo issue comment $test_issue_id --body "$report_txt" |