Torch Nightly WHL Tests #128
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Torch Nightly WHL Tests | |
on: | |
schedule: | |
# GMT+8 21:00 every workday | |
- cron: '0 14 * * 0-4' | |
# GMT+8 0:00 Saturday | |
- cron: '0 17 * * 5' | |
workflow_dispatch: | |
inputs: | |
pytorch: | |
required: false | |
type: string | |
default: 'nightly' | |
description: Pytorch branch/commit | |
ut: | |
required: false | |
type: string | |
default: 'torch_xpu' | |
description: UT scope. `op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu`. Delimiter is comma | |
suite: | |
required: true | |
type: string | |
default: 'huggingface' | |
description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench`. Delimiter is comma | |
dt: | |
required: true | |
type: string | |
default: 'float32' | |
description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma | |
mode: | |
required: true | |
type: string | |
default: 'inference' | |
description: Test mode. `inference,training`. Delimiter is comma | |
scenario: | |
required: true | |
type: string | |
default: 'accuracy' | |
description: Test scenario. `accuracy,performance`. Delimiter is comma | |
model: | |
required: false | |
type: string | |
default: '' | |
description: Model. Will only run this one mode if set | |
python: | |
required: false | |
type: string | |
default: '3.10' | |
description: Python version | |
permissions: read-all | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.sha }}-${{ github.event_name }}-${{ inputs.pytorch }}-${{ inputs.ut }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.model }}-${{ inputs.python }} | |
cancel-in-progress: true | |
jobs: | |
Linux-Nightly-Ondemand-UT-WHL-Tests: | |
if: github.event_name == 'schedule' || ${{ inputs.ut_suite }} | |
uses: ./.github/workflows/_linux_ut.yml | |
with: | |
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} | |
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} | |
pytorch: nightly_wheel | |
runner: e2e_internal | |
Linux-Nightly-Ondemand-E2E-WHL-Tests: | |
runs-on: e2e_internal | |
# Don't run on forked repos | |
if: github.repository_owner == 'intel' | |
timeout-minutes: 3600 | |
env: | |
pytorch: ${{ github.event_name == 'schedule' && 'nightly' || inputs.pytorch }} | |
ut: ${{ github.event_name == 'schedule' && 'op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu' || inputs.ut }} | |
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} | |
outputs: | |
TORCH_BRANCH_ID: ${{ steps.installed.outputs.TORCH_BRANCH_ID }} | |
TORCH_COMMIT_ID: ${{ steps.installed.outputs.TORCH_COMMIT_ID }} | |
TORCH_XPU_OPS_COMMIT: ${{ steps.installed.outputs.TORCH_XPU_OPS_COMMIT }} | |
TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} | |
TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} | |
TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} | |
TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} | |
TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} | |
TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} | |
DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} | |
KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} | |
BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} | |
OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} | |
GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} | |
TIMEOUT_MODELS: ${{ steps.summary.outputs.TIMEOUT_MODELS }} | |
steps: | |
- name: Checkout torch-xpu-ops | |
uses: actions/checkout@v4 | |
- name: Prepare Conda ENV | |
run: | | |
rm -rf myvenv-e2e | |
/usr/bin/python3.10 -m venv myvenv-e2e | |
source myvenv-e2e/bin/activate | |
pip install mkl-static==2025.0.1 mkl-include==2025.0.1 | |
pip install pandas scipy tqdm | |
- name: Prepare Stock Pytorch | |
id: installed | |
run: | | |
pwd | |
source myvenv-e2e/bin/activate | |
pip install --pre torch==2.6.0.dev20241202+xpu torchvision==0.20.0.dev20241202+xpu torchaudio==2.5.0.dev20241202+xpu --index-url https://download.pytorch.org/whl/nightly/xpu | |
echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
source .github/scripts/env.sh | |
cd ../ && rm -rf pytorch | |
git clone https://github.com/pytorch/pytorch pytorch | |
cd pytorch && git checkout ${TORCH_COMMIT_ID} | |
# apply PRs for stock pytorch | |
pip install requests | |
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py | |
git status && git show -s | |
pip install -r requirements.txt | |
echo "TORCH_XPU_OPS_COMMIT=$(<third_party/xpu.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
rm -rf third_party/torch-xpu-ops | |
git clone https://github.com/intel/torch-xpu-ops.git third_party/torch-xpu-ops | |
cd third_party/torch-xpu-ops | |
git checkout ${TORCH_XPU_OPS_COMMIT} | |
- name: Identify pinned versions | |
id: pinned | |
run: | | |
source myvenv-e2e/bin/activate | |
source .github/scripts/env.sh | |
echo "TORCHVISION_COMMIT_ID=$(python -c 'import torchvision; print(torchvision.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "TORCHAUDIO_COMMIT_ID=$(python -c 'import torchaudio; print(torchaudio.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "TRITON_COMMIT_ID=$(python -c 'import triton; print(triton.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
cd ../pytorch | |
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "BUNDLE_VERSION=" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
. /etc/os-release | |
echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" | |
echo ${GITHUB_ENV} | |
- name: Show GITHUB_ENV | |
run: | | |
echo "$GITHUB_ENV" | |
rm -rf ../pytorch/inductor_log | |
rm -rf /tmp/torchinductor_* | |
# Nihglty launch | |
- name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: huggingface | |
env_prepare: true | |
dt: float32,bfloat16,float16,amp_bf16,amp_fp16 | |
mode: inference,training | |
scenario: accuracy | |
pytorch: nightly_wheel | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
- name: Nightly Torchbench BF16 Training Accuracy Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: torchbench | |
dt: bfloat16 | |
mode: training | |
scenario: accuracy | |
pytorch: nightly_wheel | |
env_prepare: true | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
- name: Nightly Timm_models FP16 Training Accuracy Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 14 * * 0-4' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: timm_models | |
dt: float16 | |
mode: training | |
scenario: accuracy | |
pytorch: nightly_wheel | |
env_prepare: true | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
# Weekly launch | |
- name: Weekly Huggingface Full Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: huggingface | |
env_prepare: true | |
dt: float32,bfloat16,float16,amp_bf16,amp_fp16 | |
mode: inference,training | |
scenario: accuracy,performance | |
pytorch: nightly_wheel | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
- name: Weekly Torchbench Full Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: torchbench | |
env_prepare: true | |
dt: float32,bfloat16,float16,amp_bf16,amp_fp16 | |
mode: inference,training | |
scenario: accuracy,performance | |
pytorch: nightly_wheel | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
- name: Weekly Timm_models Full Test | |
if: github.event_name == 'schedule' && github.event.schedule == '0 17 * * 5' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: timm_models | |
env_prepare: true | |
dt: float32,bfloat16,float16,amp_bf16,amp_fp16 | |
mode: inference,training | |
scenario: accuracy,performance | |
pytorch: nightly_wheel | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
# On-demand launch | |
- name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) | |
if: github.event_name != 'schedule' | |
uses: ./.github/actions/inductor-xpu-e2e-test | |
with: | |
suite: ${{ inputs.suite }} | |
env_prepare: true | |
dt: ${{ inputs.dt }} | |
mode: ${{ inputs.mode }} | |
scenario: ${{ inputs.scenario }} | |
pytorch: nightly_wheel | |
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} | |
- name: Summarize archieve files | |
id: summary | |
if: ${{ ! cancelled() }} | |
run: | | |
rm -rf ${{ github.workspace }}/upload_files | |
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files | |
mkdir -p ${{ github.workspace }}/../../_backup/ && cd ${{ github.workspace }}/../../_backup/ | |
find . -type f -name "*.tgz" -mtime +3 -delete # delete files older than 3 days | |
tar zcf xpu-inductor-${GITHUB_RUN_ID}.tgz -C ${{ github.workspace }}/upload_files/ . # backup logs | |
failed_models=$(grep "Real failed models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) | |
timeout_models=$(grep "timeout models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) | |
if [ ${timeout_models} -ne 0 ];then | |
TIMEOUT_MODELS="$( | |
grep -B 1 "timeout models: [1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log | |
)" | |
echo "TIMEOUT_MODELS=\"${TIMEOUT_MODELS}\"" |awk '{printf("%s\\n", $0)}' |sed 's/\\n$//' |tee -a "${GITHUB_OUTPUT}" | |
fi | |
if [ ${failed_models} -ne 0 ];then | |
grep -E "Real failed models: [1-9]|Summary for" ${{ github.workspace }}/upload_files/summary_accuracy.log |grep "failed" -B 1 | |
exit 1 | |
fi | |
- name: Upload Inductor XPU E2E Data | |
if: ${{ ! cancelled() }} | |
uses: actions/upload-artifact@v4 | |
with: | |
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} | |
path: ${{ github.workspace }}/upload_files | |
Tests-Failure-And-Report: | |
if: ${{ ! cancelled() }} | |
runs-on: [ self-hosted, Linux ] | |
permissions: | |
issues: write | |
env: | |
GH_TOKEN: ${{ github.token }} | |
python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} | |
needs: Linux-Nightly-Ondemand-E2E-WHL-Tests | |
steps: | |
- name: Report github issue for XPU OPS nightly | |
if: github.repository_owner == 'intel' | |
run: | | |
set -xe | |
# Test env | |
build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
repo="${{ github.repository }}" | |
TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" | |
TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" | |
TORCH_XPU_OPS_COMMIT="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_XPU_OPS_COMMIT }}" | |
DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" | |
KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" | |
BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" | |
OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}" | |
GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}" | |
TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHBENCH_COMMIT_ID }}" | |
TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHVISION_COMMIT_ID }}" | |
TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" | |
TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRANSFORMERS_VERSION }}" | |
TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMM_COMMIT_ID }}" | |
TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TRITON_COMMIT_ID }}" | |
TIMEOUT_MODELS="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TIMEOUT_MODELS }}" | |
# Test status | |
if [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "success" ];then | |
test_status=Success | |
elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.result }}" == "failure" ];then | |
test_status=Failure | |
cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" | |
else | |
test_status=None | |
exit 0 | |
fi | |
# Test Type | |
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then | |
test_type="On-demand" | |
test_issue_id=426 | |
cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" | |
elif [ "${{ github.event.schedule }}" == "0 17 * * 5" ];then | |
test_type="Weekly" | |
test_issue_id=432 | |
else | |
test_type="Nightly" | |
test_issue_id=432 | |
fi | |
# Test report | |
echo -e "**${test_status}** $test_type WHL Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt | |
printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${TORCH_XPU_OPS_COMMIT:0:7} on pinned | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt | |
echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt | |
printf "Transformers | Timm | Torchbench | Torchvision | Torchaudio\n--- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt | |
printf "[${TRANSFORMERS_VERSION:0:7}](https://github.com/huggingface/transformers/commit/${TRANSFORMERS_VERSION:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TIMM_COMMIT_ID:0:7}](https://github.com/huggingface/pytorch-image-models/commit/${TIMM_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt | |
echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt | |
printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt | |
echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt | |
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then | |
test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" | |
if [ "${{ inputs.model }}" != "" ];then | |
test_scope+="; model=${{ inputs.model }}" | |
fi | |
echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt | |
fi | |
echo "$TIMEOUT_MODELS" |awk '{printf("%s\\n", $0)}' >> ${{ github.workspace }}/report.txt | |
echo "$cc_comment" >> ${{ github.workspace }}/report.txt | |
# Report | |
report_txt=$(cat ${{ github.workspace }}/report.txt) | |
gh --repo $repo issue comment $test_issue_id --body "$report_txt" |