-
Notifications
You must be signed in to change notification settings - Fork 23
154 lines (148 loc) · 6.02 KB
/
pull.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
name: pull
on:
pull_request:
types:
- opened
- synchronize
- reopened
- converted_to_draft
- ready_for_review
branches:
- main
- release/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress: true
permissions: read-all
jobs:
preci-ut:
# Don't run on forked repos and draft PRs
if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
uses: ./.github/workflows/_linux_ut.yml
with:
pytorch: release/2.5
ut: op_example,op_extended,op_ut
runner: linux.idc.xpu
preci-ut-abi-0:
# Don't run on forked repos and draft PRs
if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
uses: ./.github/workflows/_linux_ut.yml
with:
pytorch: release/2.5
abi: 0
ut: op_extended
runner: linux.idc.xpu
Inductor-XPU-E2E-CI-Tests:
runs-on: pvc_e2e
# Don't run on forked repos and draft PRs
if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
timeout-minutes: 900
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Prepare Conda ENV
run: |
which conda && conda clean -ay
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=3.10 cmake ninja -y
source activate e2e_ci
pip install mkl-static mkl-include
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
pwd
cd ../ && rm -rf pytorch
source activate e2e_ci
git clone -b release/2.5 https://github.com/pytorch/pytorch pytorch
cd pytorch
# apply PRs for stock pytorch
pip install requests
# https://github.com/mengfei25/pytorch/pull/18 internal use only for subset model list
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/18
git status && git show -s
git submodule sync && git submodule update --init --recursive
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
# Workaround for torch-xpu-ops ci test
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
- name: Triton Installation
run: |
source activate e2e_ci
cd ../pytorch
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_PINNED_COMMIT=$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)
echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT}
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
- name: Build Pytorch XPU
run: |
source activate e2e_ci
source .github/scripts/env.sh
cd ../pytorch
pip install -r requirements.txt
export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
- name: Identify pinned versions
run: |
cd ../pytorch
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
- name: Show GITHUB_ENV
run: |
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log
rm -rf /tmp/torchinductor_*
- name: Huggingface BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: bfloat16
mode: training
scenario: accuracy
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Huggingface FP16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: float16
mode: training
scenario: accuracy
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Timm_models BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: timm_models
dt: bfloat16
mode: training
scenario: accuracy
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Torchbench BF16 Training Accuracy Test
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: torchbench
dt: bfloat16
mode: training
scenario: accuracy
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Summarize archieve files
if: ${{ ! cancelled() }}
run: |
rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true)
if [ ${failed_case} -ne 0 ];then
grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log
exit 1
fi
- name: Upload Inductor XPU E2E Data
if: ${{ ! cancelled() }}
uses: actions/upload-artifact@v4
with:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files