Add distributed backend (XCCL) #4286
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull-request CI for torch-xpu-ops: runs the unit-test reusable workflow and
# the Inductor XPU end-to-end accuracy tests for PRs targeting main/release/*.
name: pull

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - converted_to_draft
      - ready_for_review
    branches:
      - main
      - release/*

# One active run per workflow + PR number (falls back to the commit SHA when
# no PR number is available); a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true

permissions: read-all

jobs:
  # Unit tests through the shared reusable workflow.
  preci-ut:
    # Don't run on forked repos and draft PRs
    if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
    uses: ./.github/workflows/_linux_ut.yml
    with:
      ut: op_regression,op_regression_dev1,op_extended,op_ut
      runner: linux.idc.xpu

  # Same reusable workflow, restricted to op_extended, with `abi: 0`.
  # NOTE(review): the meaning of `abi` is defined in _linux_ut.yml
  # (presumably a C++ ABI build switch) - confirm there.
  preci-ut-abi-0:
    # Don't run on forked repos and draft PRs
    if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
    uses: ./.github/workflows/_linux_ut.yml
    with:
      abi: 0
      ut: op_extended
      runner: linux.idc.xpu

  # Builds stock PyTorch with this checkout of torch-xpu-ops and runs the
  # Inductor accuracy suites on the `pvc_e2e` runner.
  Inductor-XPU-E2E-CI-Tests:
    runs-on: pvc_e2e
    # Don't run on forked repos and draft PRs
    if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
    timeout-minutes: 900
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4

      # Recreate the `e2e_ci` conda environment from scratch; if
      # `conda remove` fails, the environment directory is deleted instead.
      - name: Prepare Conda ENV
        run: |
          which conda && conda clean -ay
          conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
          conda create -n e2e_ci python=3.10 cmake ninja -y
          source activate e2e_ci
          pip install mkl-static mkl-include
          pip install pandas scipy tqdm

      # Fresh clone of pytorch main next to the workspace, patched with the
      # PR listed below, with third_party/torch-xpu-ops replaced by this
      # checkout. The final `sed` rewrites the
      # `checkout --quiet ${TORCH_XPU_OPS_COMMIT}` command in
      # caffe2/CMakeLists.txt to `log -n 1`.
      # NOTE(review): presumably this keeps the build from switching the
      # copied sources to the pinned commit - confirm against CMakeLists.txt.
      - name: Prepare Stock Pytorch
        run: |
          pwd
          cd ../ && rm -rf pytorch
          source activate e2e_ci
          git clone -b main https://github.com/pytorch/pytorch pytorch
          cd pytorch
          # apply PRs for stock pytorch
          pip install requests
          # https://github.com/mengfei25/pytorch/pull/18 internal use only for subset model list
          python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/18
          git status && git show -s
          git submodule sync && git submodule update --init --recursive
          rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
          # Workaround for torch-xpu-ops ci test
          sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt

      # Install Triton for XPU at the commit pinned by the pytorch checkout.
      - name: Triton Installation
        run: |
          source activate e2e_ci
          cd ../pytorch
          TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
          TRITON_PINNED_COMMIT=$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)
          echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT}
          pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"

      # Build a PyTorch wheel and install it into the conda environment.
      # env.sh is sourced from this repository before changing directory.
      - name: Build Pytorch XPU
        run: |
          source activate e2e_ci
          source .github/scripts/env.sh
          cd ../pytorch
          pip install -r requirements.txt
          export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}:${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
          python setup.py bdist_wheel
          pip install --force-reinstall dist/*.whl

      # Export the dependency pins recorded in the pytorch checkout to
      # GITHUB_ENV so that later steps can read them as environment variables.
      - name: Identify pinned versions
        run: |
          cd ../pytorch
          echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
          echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
          echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
          echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
          echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
          echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"

      # Remove logs and Inductor caches left by earlier runs, then print the
      # Triton version and the PyTorch environment report.
      - name: Torch Config
        run: |
          echo "$GITHUB_ENV"
          rm -rf ../pytorch/inductor_log
          rm -rf /tmp/torchinductor_*
          cd ..
          source activate e2e_ci
          python -c "import triton; print(triton.__version__)"
          python pytorch/torch/utils/collect_env.py

      # Accuracy suites, all run through the repository's local action.
      # NOTE(review): `env_prepare: true` is set on the first step of each
      # suite only; presumably the FP16 Huggingface step reuses the
      # environment prepared by the BF16 step - confirm in the action.
      - name: Huggingface BF16 Training Accuracy Test
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          dt: bfloat16
          mode: training
          scenario: accuracy
          env_prepare: true
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      - name: Huggingface FP16 Training Accuracy Test
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: huggingface
          dt: float16
          mode: training
          scenario: accuracy
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      - name: Timm_models BF16 Training Accuracy Test
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: timm_models
          dt: bfloat16
          mode: training
          scenario: accuracy
          env_prepare: true
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      - name: Torchbench BF16 Training Accuracy Test
        uses: ./.github/actions/inductor-xpu-e2e-test
        with:
          suite: torchbench
          dt: bfloat16
          mode: training
          scenario: accuracy
          env_prepare: true
          hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

      # Copy the logs into upload_files and fail the job when the accuracy
      # summary reports a non-zero "Real failed models" count. Both greps
      # read the same copied summary file and use the same failure pattern,
      # so every counted line is also printed. Runs unless the workflow was
      # cancelled, so logs from failed test steps are still collected.
      - name: Summarize archieve files
        if: ${{ ! cancelled() }}
        run: |
          rm -rf ${{ github.workspace }}/upload_files
          cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
          summary_log="${{ github.workspace }}/upload_files/summary_accuracy.log"
          failed_case=$(grep "Real failed models: *[1-9]" "${summary_log}" | wc -l || true)
          if [ "${failed_case}" -ne 0 ]; then
            grep -E "Real failed models: *[1-9]|Summary for" "${summary_log}"
            exit 1
          fi

      # Publish the collected logs; the artifact name carries the PR number
      # (or the commit SHA when no PR number is available).
      - name: Upload Inductor XPU E2E Data
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
          path: ${{ github.workspace }}/upload_files