# Inductor E2E Nightly Tests - workflow definition (captured from the GitHub Actions page for run #39).

name: Inductor E2E Nightly Tests

# Triggers: manual dispatch (with overridable source revisions) and a nightly
# schedule. The dispatch inputs are not populated for `schedule` events, so the
# defaults below only take effect for manual runs.
on:
  workflow_dispatch:
    inputs:
      # PyTorch source: repository URL, branch/tag and exact commit to build.
      torchrepo:
        description: 'torchrepo'
        required: true
        default: 'https://github.com/pytorch/pytorch.git'
      torchbranch:
        description: 'torchbranch'
        required: true
        default: 'v2.0.1'
      torchcommit:
        description: 'torchcommit'
        required: true
        default: 'e9ebda29d87ce0916ab08c06ab26fd3766a870e5'
      # Intel Extension for PyTorch source: repository URL, branch and commit.
      ipexrepo:
        description: 'ipexrepo'
        required: true
        default: 'https://github.com/intel/intel-extension-for-pytorch.git'
      ipexbranch:
        description: 'ipexbranch'
        required: true
        default: 'xpu-master'
      ipexcommit:
        description: 'ipexcommit'
        required: true
        default: '4af80f77740ed939be78eba28ae36951823f335c'
      # Quoted so the version is always treated as a string.
      oneapi:
        description: 'oneAPI basekit version'
        required: true
        default: '2023.2.0'
  schedule:
    - cron: '0 14 * * *'  # run at 2 PM UTC
# The three jobs run in sequence and share state through ${HOME}/triton-nightly
# and the `triton-nightly-test` conda environment on the runner.
# NOTE(review): this presumably requires all jobs to land on the same
# self-hosted machine - confirm only one runner carries the PVC_E2E label.
jobs:
  # Builds PyTorch, IPEX and Triton (Intel XPU backend) from source and stages
  # the benchmark scripts next to the PyTorch checkout.
  Tests-Env-Prepare:
    runs-on: [self-hosted, PVC_E2E]
    env:
      # `github.event.inputs` is empty on `schedule` runs, so each value falls
      # back to the workflow_dispatch default. Going through env (instead of
      # pasting the expression into the script) also keeps user-supplied text
      # out of the shell source.
      # NOTE(review): scheduled runs previously passed empty arguments; confirm
      # these fallbacks match what env_prepare.sh used in that case.
      E2E_TORCH_REPO: ${{ github.event.inputs.torchrepo || 'https://github.com/pytorch/pytorch.git' }}
      E2E_TORCH_BRANCH: ${{ github.event.inputs.torchbranch || 'v2.0.1' }}
      E2E_TORCH_COMMIT: ${{ github.event.inputs.torchcommit || 'e9ebda29d87ce0916ab08c06ab26fd3766a870e5' }}
      E2E_IPEX_REPO: ${{ github.event.inputs.ipexrepo || 'https://github.com/intel/intel-extension-for-pytorch.git' }}
      E2E_IPEX_BRANCH: ${{ github.event.inputs.ipexbranch || 'xpu-master' }}
      E2E_IPEX_COMMIT: ${{ github.event.inputs.ipexcommit || '4af80f77740ed939be78eba28ae36951823f335c' }}
      E2E_ONEAPI_VERSION: ${{ github.event.inputs.oneapi || '2023.2.0' }}
    steps:
      # Despite the name, this step activates an existing environment and
      # installs build prerequisites into it; it does not create the env.
      - name: Create conda environment
        run: |
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          conda install -y astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions future six requests dataclasses mkl mkl-include
          conda install -y -c conda-forge libstdcxx-ng

      - name: Triton source code prepare
        run: |
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          cd ${HOME}/triton-nightly
          rm -rf triton
          git clone https://github.com/openai/triton triton
          cd triton
          # Record the Triton revision under test; copied into the results later.
          triton_commit=`git rev-parse HEAD`
          echo "triton_commit: ${triton_commit}" | tee sw_info.log
          git submodule sync
          git submodule update --init --recursive --jobs 0
          # Move the XPU backend submodule to the tip of its main branch.
          cd third_party/intel_xpu_backend
          git checkout main && git pull

      - name: Install Dependency
        run: |
          # Printed before and after activation to show which interpreter is used.
          python --version
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          python --version
          pip install setuptools cython numpy wheel scikit-build scipy
          pip install psutil cpuid
          cd ${HOME}/triton-nightly
          cp triton/third_party/intel_xpu_backend/.github/scripts/env_prepare.sh .
          cp triton/third_party/intel_xpu_backend/.github/scripts/env_triton.sh ${HOME}/
          cp -r triton/third_party/intel_xpu_backend/.github/patches/ .
          bash env_prepare.sh triton-nightly \
            "${E2E_TORCH_REPO}" \
            "${E2E_TORCH_BRANCH}" \
            "${E2E_TORCH_COMMIT}" \
            "${E2E_IPEX_REPO}" \
            "${E2E_IPEX_BRANCH}" \
            "${E2E_IPEX_COMMIT}" \
            "${E2E_ONEAPI_VERSION}"
          source ${HOME}/env_triton.sh "${E2E_ONEAPI_VERSION}"
          # The import is tested inside `if` because the step shell runs with
          # `-e`: a separate exit-status check after the command is never reached.
          if ! python -c "import torch;import intel_extension_for_pytorch"; then
            echo -e "[ERROR] Public-torch or IPEX BUILD FAIL"
            exit 1
          fi

      - name: Build Triton
        shell: bash
        run: |
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          pip uninstall -y triton
          sudo update-ca-certificates --fresh
          export SSL_CERT_DIR=/etc/ssl/certs
          pip install pybind11
          cd ${HOME}/triton-nightly/triton/python
          python setup.py clean
          TRITON_CODEGEN_INTEL_XPU_BACKEND=1 python setup.py bdist_wheel
          pip install dist/*.whl
          cd ${HOME}/triton-nightly
          source ${HOME}/env_triton.sh "${E2E_ONEAPI_VERSION}"
          # Tested inside `if` so the error message is printed under `-e`.
          if ! python -c "import triton"; then
            echo -e "[ERROR] Triton BUILD FAIL"
            exit 1
          fi

      # NOTE(review): this step does not activate the conda environment, so
      # `pip` is whichever one is first on PATH; the perf summary step also runs
      # `python` without activation - confirm that pairing is intended.
      - name: Prepare Benchmark
        run: |
          cp ${HOME}/triton-nightly/triton/third_party/intel_xpu_backend/.github/scripts/inductor_xpu_test.sh ${HOME}/triton-nightly/pytorch
          cp ${HOME}/triton-nightly/triton/third_party/intel_xpu_backend/.github/scripts/inductor_perf_summary.py ${HOME}/triton-nightly/pytorch
          pip install styleFrame scipy pandas

  # Runs the HuggingFace accuracy suite for amp_bf16/amp_fp16 (inference and
  # training), summarises the results, uploads them, and enforces minimum
  # pass counts.
  Accuracy-Test:
    needs: Tests-Env-Prepare
    runs-on: [self-hosted, PVC_E2E]
    env:
      # Falls back to the workflow_dispatch default on `schedule` runs.
      E2E_ONEAPI_VERSION: ${{ github.event.inputs.oneapi || '2023.2.0' }}
    steps:
      - name: ACC Test for triton on PVC
        run: |
          echo -e "[ INFO ] Run E2E Acc test on Node $(hostname)"
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          source ${HOME}/env_triton.sh "${E2E_ONEAPI_VERSION}"
          cd ${HOME}/triton-nightly
          # NOTE(review): run in a child shell, so any variables it exports do
          # not reach this script - confirm set_proxy.sh does not rely on that.
          bash set_proxy.sh
          cd ${HOME}/triton-nightly/pytorch
          rm -rf inductor_log
          # Four configurations run concurrently; the last argument differs per
          # job (0-3). `wait` without operands blocks until all have finished
          # and does not propagate their exit codes.
          bash inductor_xpu_test.sh huggingface amp_bf16 inference accuracy xpu 0 &
          bash inductor_xpu_test.sh huggingface amp_bf16 training accuracy xpu 1 &
          bash inductor_xpu_test.sh huggingface amp_fp16 inference accuracy xpu 2 &
          bash inductor_xpu_test.sh huggingface amp_fp16 training accuracy xpu 3 &
          wait
          cp ${HOME}/triton-nightly/triton/sw_info.log inductor_log/

      - name: ACC Test Results Overview
        run: |
          cd ${HOME}/triton-nightly/pytorch/inductor_log/huggingface

          # pass_rate PASSED TOTAL: print PASSED/TOTAL as a percentage with two
          # decimals, or "N/A" when TOTAL is not positive (avoids dividing by zero).
          pass_rate() {
            if [ "$2" -gt 0 ]; then
              awk -v passed="$1" -v total="$2" 'BEGIN{printf "%.2f%%\n",(passed/total)*100}'
            else
              echo "N/A"
            fi
          }

          # summarize PRECISION: append the accuracy summary for one precision
          # to PRECISION/e2e_summary.log. The total is the inference CSV line
          # count minus one (header row) and is reused for training as well.
          # Arithmetic uses $(( )) rather than `let`, because `let` returns a
          # failing status when the result is 0 and would abort the step.
          summarize() {
            local precision=$1
            local summary=./e2e_summary.log
            local csv_lines total passed failed mode tag
            cd "${precision}"
            echo -e "============ Acc Check for HF ${precision} ============" | tee -a "${summary}"
            csv_lines=$(cat inductor_huggingface_${precision}_inference_xpu_accuracy.csv | wc -l)
            total=$((csv_lines - 1))
            echo "num_total_${precision}: ${total}" | tee -a "${summary}"
            for mode in inference training; do
              # First three letters give the log tag: inf / tra.
              tag=${mode:0:3}
              passed=$(grep "pass" inductor_huggingface_${precision}_${mode}_xpu_accuracy.csv | wc -l)
              failed=$((total - passed))
              echo "num_passed_${precision}_${tag}: ${passed}" | tee -a "${summary}"
              echo "num_failed_${precision}_${tag}: ${failed}" | tee -a "${summary}"
              echo "${precision}_${tag}_acc_pass_rate: $(pass_rate "${passed}" "${total}")" | tee -a "${summary}"
            done
            cd ..
          }

          summarize amp_bf16
          summarize amp_fp16

      # NOTE(review): GitHub has announced deprecation of upload-artifact v3;
      # v4 rejects a second upload under the same artifact name, and the
      # Performance-Test job reuses this name - plan the upgrade accordingly.
      - name: Upload Triton Inductor E2E Nightly Data
        uses: actions/upload-artifact@v3
        with:
          name: Triton-Inductor-E2E-Nightly-Data
          path: /home/gta/triton-nightly/pytorch/inductor_log/

      - name: Test Results Check
        run: |
          cd ${HOME}/triton-nightly/pytorch/inductor_log/huggingface
          status=0

          # check_passed PRECISION TAG MODE MINIMUM: read the passed count that
          # the overview step logged and flag a failure when it is below
          # MINIMUM. A missing or empty value counts as 0, so absent results
          # fail the check instead of slipping through.
          check_passed() {
            local precision=$1 tag=$2 mode=$3 minimum=$4
            local passed
            passed=$(grep "num_passed_${precision}_${tag}:" "${precision}/e2e_summary.log" | tail -n 1 | sed -e 's/.*://;s/[^0-9]//g')
            if [ "${passed:-0}" -lt "${minimum}" ]; then
              echo -e "[ERROR] Inductor E2E Nightly test for HF ${precision} ${mode} passed_num < ${minimum}"
              status=1
            fi
          }

          # All four thresholds are evaluated so every regression is reported.
          check_passed amp_bf16 inf inference 45
          check_passed amp_bf16 tra training 42
          check_passed amp_fp16 inf inference 45
          check_passed amp_fp16 tra training 42
          exit ${status}

  # Runs the HuggingFace performance suite for the same four configurations
  # and uploads the combined log directory.
  Performance-Test:
    needs: Accuracy-Test
    runs-on: [self-hosted, PVC_E2E]
    env:
      # Falls back to the workflow_dispatch default on `schedule` runs.
      E2E_ONEAPI_VERSION: ${{ github.event.inputs.oneapi || '2023.2.0' }}
    steps:
      - name: Perf Test for triton on PVC
        run: |
          echo -e "[ INFO ] Run E2E Perf test on Node $(hostname)"
          source ${HOME}/miniconda3/bin/activate triton-nightly-test
          # Pass the oneAPI version like every other env_triton.sh call in this
          # workflow, so accuracy and performance use the same toolchain.
          source ${HOME}/env_triton.sh "${E2E_ONEAPI_VERSION}"
          cd ${HOME}/triton-nightly
          bash set_proxy.sh
          cd ${HOME}/triton-nightly/pytorch
          # inductor_log is not cleared here, so the accuracy results stay in place.
          bash inductor_xpu_test.sh huggingface amp_bf16 inference performance xpu 0 &
          bash inductor_xpu_test.sh huggingface amp_bf16 training performance xpu 1 &
          bash inductor_xpu_test.sh huggingface amp_fp16 inference performance xpu 2 &
          bash inductor_xpu_test.sh huggingface amp_fp16 training performance xpu 3 &
          wait

      - name: Perf Test Results Generate and Overview
        run: |
          cd ${HOME}/triton-nightly/pytorch
          python inductor_perf_summary.py -s huggingface -p amp_bf16 amp_fp16

      # Same artifact name as the Accuracy-Test upload (see the v3/v4 note there).
      - name: Upload Triton Inductor E2E Nightly Data
        uses: actions/upload-artifact@v3
        with:
          name: Triton-Inductor-E2E-Nightly-Data
          path: /home/gta/triton-nightly/pytorch/inductor_log/