From e2e802f5ee117b1544eb75928a7c15eccdfe12fa Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Fri, 20 Dec 2024 08:39:23 -0800 Subject: [PATCH 1/7] ci: print annotations for key package versions in transformers test (#1184) Annotations are available on a summary page of executed workflow. --------- Signed-off-by: Dmitry Rogozhkin --- .github/workflows/_linux_transformers.yml | 50 +++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fd099fcb6..95aee8e7e 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -50,6 +50,7 @@ jobs: DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }} + transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' steps: - name: Checkout torch-xpu-ops @@ -60,7 +61,7 @@ jobs: uses: actions/checkout@v4 with: repository: huggingface/transformers - ref: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} + ref: ${{ env.transformers }} path: transformers - name: Prepare OS environment run: | @@ -106,12 +107,54 @@ jobs: id: installed run: | source activate huggingface_transformers_test - echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "pip installed packages:" pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt + echo "lspci gpu devices:" + lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt echo "GPU render nodes:" cat /sys/class/drm/render*/device/device | tee ${{ github.workspace 
}}/transformers/tests_log/device_IDs.txt + # printing annotations for the key packages + echo "### Annotations" >> $GITHUB_STEP_SUMMARY + echo "| | |" >> $GITHUB_STEP_SUMMARY + echo "| --- | --- |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" >> $GITHUB_STEP_SUMMARY + packages=" \ + level-zero \ + libigc1 \ + libigc2 \ + libze1 \ + libze-intel-gpu1 \ + intel-i915-dkms \ + intel-level-zero-gpu \ + intel-opencl-icd" + for package in $packages; do + package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY + done + packages="accelerate \ + numpy \ + torch \ + torchaudio \ + torchvision \ + transformers" + for package in $packages; do + package_version=$(python -c "import $package; print($package.__version__)") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY + done + # printing annotations for GPU cards + var="[$(cat /sys/class/drm/render*/device/vendor)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY + var="[$(cat /sys/class/drm/render*/device/device)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY + var=$(python -c "import torch; print(torch.version.xpu)") + echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" >> $GITHUB_STEP_SUMMARY + var=$(python -c "import torch; print(torch.xpu.device_count())") + echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" >> $GITHUB_STEP_SUMMARY + # printing annotations with key environment variables + echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK 
|" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" >> $GITHUB_STEP_SUMMARY - name: Sanitry check installed packages run: | source activate huggingface_transformers_test @@ -120,6 +163,7 @@ jobs: pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu + python -c 'import torch; exit(not torch.xpu.is_available())' - name: Run XPU backbone run: | source activate huggingface_transformers_test From 7137aeb9fcda0ce3344b9d02a1d47fc8bc35f430 Mon Sep 17 00:00:00 2001 From: "Wang, Chuanqi" Date: Sun, 22 Dec 2024 20:04:31 +0800 Subject: [PATCH 2/7] [CI] Add ccl and mpi env source for XCCL backend related PR test (#1179) --- .github/scripts/env.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 4fd192c06..9cfd67477 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -4,6 +4,8 @@ if [ "$1" != "nightly_wheel" ];then source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/umf/latest/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh + source /opt/intel/oneapi/ccl/latest/env/vars.sh + source /opt/intel/oneapi/mpi/latest/env/vars.sh else echo "Don't need to source DL-Essential for nightly wheel" fi From 6899263daf50a9848b47c561a6e08d72991f52dc Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Sun, 22 Dec 2024 21:14:13 -0800 Subject: [PATCH 3/7] ci/transformers: add pipeline and trainer tests (#1185) Changes: * Add testing of tests/pipelines * Add testing of tests/trainer * Add printing results summary (to workload summary page) * Add printing failure lines (to workload summary page) Summary page contains sections in this order: * Results summary * Failure lines * Annotations --------- Signed-off-by: Dmitry Rogozhkin --- .github/workflows/_linux_transformers.yml | 168 ++++++++++++++++------ 1 file changed, 123 insertions(+), 45 deletions(-) diff --git 
a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 95aee8e7e..65dde1b6d 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -104,7 +104,6 @@ jobs: rm -rf reports cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./ - name: Report installed versions - id: installed run: | source activate huggingface_transformers_test echo "pip installed packages:" @@ -113,48 +112,6 @@ jobs: lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt echo "GPU render nodes:" cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt - # printing annotations for the key packages - echo "### Annotations" >> $GITHUB_STEP_SUMMARY - echo "| | |" >> $GITHUB_STEP_SUMMARY - echo "| --- | --- |" >> $GITHUB_STEP_SUMMARY - echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" >> $GITHUB_STEP_SUMMARY - echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" >> $GITHUB_STEP_SUMMARY - echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" >> $GITHUB_STEP_SUMMARY - packages=" \ - level-zero \ - libigc1 \ - libigc2 \ - libze1 \ - libze-intel-gpu1 \ - intel-i915-dkms \ - intel-level-zero-gpu \ - intel-opencl-icd" - for package in $packages; do - package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") - echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY - done - packages="accelerate \ - numpy \ - torch \ - torchaudio \ - torchvision \ - transformers" - for package in $packages; do - package_version=$(python -c "import $package; print($package.__version__)") - echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY - done - # printing annotations for GPU cards - var="[$(cat /sys/class/drm/render*/device/vendor)]" - echo "| 
jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY - var="[$(cat /sys/class/drm/render*/device/device)]" - echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY - var=$(python -c "import torch; print(torch.version.xpu)") - echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" >> $GITHUB_STEP_SUMMARY - var=$(python -c "import torch; print(torch.xpu.device_count())") - echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" >> $GITHUB_STEP_SUMMARY - # printing annotations with key environment variables - echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" >> $GITHUB_STEP_SUMMARY - echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" >> $GITHUB_STEP_SUMMARY - name: Sanitry check installed packages run: | source activate huggingface_transformers_test @@ -164,11 +121,132 @@ jobs: pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - - name: Run XPU backbone + - name: Run -k backbone tests run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=tests_benchmark -k backbone tests + python3 -m pytest -rsf --make-reports=tests_backbone -k backbone tests + - name: Run tests/pipelines + run: | + source activate huggingface_transformers_test + cd transformers + # Some tests are known to fail w/o clear pattern + # TODO: drop ||true after triage and fixes + python3 -m pytest -rsf --make-reports=tests_pipelines tests/pipelines || true + - name: Run tests/trainer + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Some ray tests hang, reason unknown + # * torch.distributed.* not yet supported by XPU + pattern=" \ + not ray and \ + not TestTrainerDistributed and \ + not TestTrainerDistributedXPU and \ + not TestFSDPTrainer" + python3 
-m pytest -rsf --make-reports=tests_trainer tests/trainer -k "$pattern" + - name: Print results table + if: ${{ ! cancelled() }} + run: | + # Helper function to return number preceeding given pattern, i.e: + # === 25 failed, 11 warnings, 0 errors === + # Call as follows: + # parse_stat $line "failed" + function parse_stat() { + stat=$(cat $1 | grep $2 | sed "s/.* \([0-9]*\) $2.*/\1/") + if [ -n "$stat" ]; then echo $stat; else echo "0"; fi + } + cd transformers + { + echo "### Results" + echo "| Test group | Errors | Failed | Passed | Skipped |" + echo "| --- | --- | --- | --- | --- |" + for stat in $(find reports -name stats.txt); do + # Each stat.txt is located in: reports/$test_group/stats.txt + test_group=$(echo $stat | cut -f 2 -d/) + # Get failed, passed, skipped, etc. counters + failed=$(parse_stat $stat failed) + passed=$(parse_stat $stat passed) + skipped=$(parse_stat $stat skipped) + warnings=$(parse_stat $stat warnings) + errors=$(parse_stat $stat errors) + echo "| $test_group | $errors | $failed | $passed | $skipped |" + done + } >> $GITHUB_STEP_SUMMARY + - name: Print failure lines + if: ${{ ! cancelled() }} + run: | + cd transformers + { + echo "### Failure lines" + echo "| File | Error | Comment |" + echo "| --- | --- | --- |" + rm -rf _failures.txt + for failure in $(find reports -name failures_line.txt); do + tail -n +2 $failure >> _failures.txt + done + # failures_line.txt file does not have test case information, + # so we can just sort the output and report uniq values + sort _failures.txt | uniq > _failures_uniq.txt + while read line; do + file=$(echo $line | cut -f1 -d" " | sed "s/\(.*\):$/\1/") + error=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") + # Failure comments often contain special characters which complicate + # parsing failure lines. But fortunately we know for sure where comments + # start. So we just output all contents starting from this position and + # wrap everything in
 to avoid collisions with Markdown formatting.
+              comment="
$(echo $line | cut -f3- -d' ' | sed 's/\(.*\):$/\1/')
" + echo "| $file | $error | $comment |" + done <_failures_uniq.txt + } >> $GITHUB_STEP_SUMMARY + - name: Print annotations + if: ${{ ! cancelled() }} + run: | + source activate huggingface_transformers_test + { + echo "### Annotations" + echo "| | |" + echo "| --- | --- |" + echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" + echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" + echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" + packages=" \ + level-zero \ + libigc1 \ + libigc2 \ + libze1 \ + libze-intel-gpu1 \ + intel-i915-dkms \ + intel-level-zero-gpu \ + intel-opencl-icd" + for package in $packages; do + package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + packages="accelerate \ + numpy \ + torch \ + torchaudio \ + torchvision \ + transformers" + for package in $packages; do + package_version=$(python -c "import $package; print($package.__version__)" || true) + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" + done + # printing annotations for GPU cards + var="[$(cat /sys/class/drm/render*/device/vendor || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |" + var="[$(cat /sys/class/drm/render*/device/device || true)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |" + var=$(python -c "import torch; print(torch.version.xpu)" || true) + echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" + var=$(python -c "import torch; print(torch.xpu.device_count())" || true) + echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" + # printing annotations with key environment variables + echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" + echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" + } >> $GITHUB_STEP_SUMMARY - name: Upload Test 
log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 From 212ee906b6d1aa4c0360395803cc2ee43b2d9741 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Mon, 23 Dec 2024 18:09:35 -0800 Subject: [PATCH 4/7] ci/transformers: run tests in utils, benchmark, generation, models (#1190) Signed-off-by: Dmitry Rogozhkin --- .github/workflows/_linux_transformers.yml | 93 +++++++++++++++++++---- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index 65dde1b6d..f79227658 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -122,18 +122,64 @@ jobs: pip show torchvision | grep Version | grep xpu python -c 'import torch; exit(not torch.xpu.is_available())' - name: Run -k backbone tests + env: + TEST_CASE: 'tests_backbone' run: | source activate huggingface_transformers_test cd transformers - python3 -m pytest -rsf --make-reports=tests_backbone -k backbone tests + python3 -m pytest -rsf --make-reports=$TEST_CASE -k backbone tests || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/*.py + env: + TEST_CASE: 'tests_py' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/*.py || true + - name: Run tests/benchmark + env: + TEST_CASE: 'tests_benchmark' + run: | + source activate huggingface_transformers_test + cd transformers + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/benchmark || true + - name: Run tests/generation + env: + TEST_CASE: 'tests_generation' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * torch.distributed.* not yet supported by XPU + pattern="not TestFSDPGeneration" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/generation -k "$pattern" || true + - name: Run tests/models + env: + TEST_CASE: 'tests_models' + run: | + 
source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * https://github.com/huggingface/transformers/issues/35252 (CUDA specific tests) + # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals) + pattern=" \ + not test_model_parallelization and \ + not test_model_parallel_equal_results and \ + not test_resize_embeddings_untied and \ + not test_resize_tokens_embeddings" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/models -k "$pattern" || true - name: Run tests/pipelines + env: + TEST_CASE: 'tests_pipelines' run: | source activate huggingface_transformers_test cd transformers # Some tests are known to fail w/o clear pattern # TODO: drop ||true after triage and fixes - python3 -m pytest -rsf --make-reports=tests_pipelines tests/pipelines || true + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/pipelines || true - name: Run tests/trainer + env: + TEST_CASE: 'tests_trainer' run: | source activate huggingface_transformers_test cd transformers @@ -145,7 +191,24 @@ jobs: not TestTrainerDistributed and \ not TestTrainerDistributedXPU and \ not TestFSDPTrainer" - python3 -m pytest -rsf --make-reports=tests_trainer tests/trainer -k "$pattern" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/trainer -k "$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Run tests/utils + env: + TEST_CASE: 'tests_utils' + run: | + source activate huggingface_transformers_test + cd transformers + # Excluding tests due to: + # * Network proxy connection issue, reason unknown + pattern="not test_load_img_url_timeout" + python3 -m pytest -rsf --make-reports=$TEST_CASE tests/utils -k "$pattern" || \ + (echo "FAILED_CASES=$FAILED_CASES,$TEST_CASE" >> $GITHUB_ENV) + - name: Check for errors in tests + run: | + FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//') + echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]" + test -z "$FAILED_CASES" - name: Print results table 
if: ${{ ! cancelled() }} run: | @@ -160,18 +223,19 @@ jobs: cd transformers { echo "### Results" - echo "| Test group | Errors | Failed | Passed | Skipped |" - echo "| --- | --- | --- | --- | --- |" + echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |" + echo "| --- | --- | --- | --- | --- | --- |" for stat in $(find reports -name stats.txt); do # Each stat.txt is located in: reports/$test_group/stats.txt test_group=$(echo $stat | cut -f 2 -d/) # Get failed, passed, skipped, etc. counters failed=$(parse_stat $stat failed) passed=$(parse_stat $stat passed) + deselected=$(parse_stat $stat deselected) skipped=$(parse_stat $stat skipped) warnings=$(parse_stat $stat warnings) errors=$(parse_stat $stat errors) - echo "| $test_group | $errors | $failed | $passed | $skipped |" + echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |" done } >> $GITHUB_STEP_SUMMARY - name: Print failure lines @@ -180,24 +244,27 @@ jobs: cd transformers { echo "### Failure lines" - echo "| File | Error | Comment |" - echo "| --- | --- | --- |" + echo "| Test group |File | Error | Comment |" + echo "| --- | --- | --- | --- |" rm -rf _failures.txt for failure in $(find reports -name failures_line.txt); do - tail -n +2 $failure >> _failures.txt + # Each failure_line.txt is located in: reports/$test_group/failure_line.txt + test_group=$(echo $failure | cut -f2 -d/) + tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt done # failures_line.txt file does not have test case information, # so we can just sort the output and report uniq values sort _failures.txt | uniq > _failures_uniq.txt while read line; do - file=$(echo $line | cut -f1 -d" " | sed "s/\(.*\):$/\1/") - error=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") + test_group=$(echo $line | cut -f1 -d" ") + file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/") + error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/") # Failure comments often contain special characters which 
complicate # parsing failure lines. But fortunately we know for sure where comments # start. So we just output all contents starting from this position and # wrap everything in
 to avoid collisions with Markdown formatting.
-              comment="
$(echo $line | cut -f3- -d' ' | sed 's/\(.*\):$/\1/')
" - echo "| $file | $error | $comment |" + comment="
$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')
" + echo "| $test_group | $file | $error | $comment |" done <_failures_uniq.txt } >> $GITHUB_STEP_SUMMARY - name: Print annotations From bc99386b32af44a1122be154e4689f72d4d0fbef Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Tue, 24 Dec 2024 16:22:50 +0800 Subject: [PATCH 5/7] Apply new tolerance modification patch for E2E (#1203) Fixing building error related to the patch: https://github.com/pytorch/pytorch/pull/129735. Apply https://github.com/pytorch/pytorch/pull/143739 for replacement. --- .github/scripts/apply_torch_pr.py | 3 +-- src/ATen/native/transformers/SDPUtils.cpp | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/scripts/apply_torch_pr.py b/.github/scripts/apply_torch_pr.py index 89fa32fdf..bbe89ed7d 100644 --- a/.github/scripts/apply_torch_pr.py +++ b/.github/scripts/apply_torch_pr.py @@ -12,8 +12,7 @@ # Fallback to CPU for XPU FP64 "https://github.com/pytorch/pytorch/pull/126516", # Modify the tolerance level in TIMM benchmark - # "https://github.com/pytorch/pytorch/pull/129735", - "https://github.com/mengfei25/pytorch/pull/21", + "https://github.com/pytorch/pytorch/pull/143739", ] ) parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[]) diff --git a/src/ATen/native/transformers/SDPUtils.cpp b/src/ATen/native/transformers/SDPUtils.cpp index db4409493..eca5f9829 100644 --- a/src/ATen/native/transformers/SDPUtils.cpp +++ b/src/ATen/native/transformers/SDPUtils.cpp @@ -4,6 +4,8 @@ namespace sdp { +using c10::array_of; + bool check_all_tensors_on_device(sdp_params const& params, bool debug) { // Check that all tensors are on the GPU device // This should be handled by the stub dispatch, but whe call From 0f48ac07e42ce30d2d07447f4b49bb4ab23f8e64 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Wed, 25 Dec 2024 09:10:36 +0800 Subject: [PATCH 6/7] Add skip lists for LNL, BGM and MTL devices (#1187) Add skip lists for LNL, BGM and MTL devices --------- Signed-off-by: Cheng, Penghui Signed-off-by: Cheng --- 
test/xpu/extended/run_test_with_skip_bmg.py | 22 ++++++++++++ test/xpu/extended/run_test_with_skip_lnl.py | 22 ++++++++++++ test/xpu/extended/run_test_with_skip_mtl.py | 22 ++++++++++++ test/xpu/extended/skip_list_win_bmg.py | 13 +++++++ test/xpu/extended/skip_list_win_lnl.py | 13 +++++++ test/xpu/extended/skip_list_win_mtl.py | 20 +++++++++++ test/xpu/run_test_with_skip_bmg.py | 24 +++++++++++++ test/xpu/run_test_with_skip_lnl.py | 24 +++++++++++++ test/xpu/skip_list_win_bmg.py | 39 +++++++++++++++++++++ test/xpu/skip_list_win_lnl.py | 38 ++++++++++++++++++++ 10 files changed, 237 insertions(+) create mode 100644 test/xpu/extended/run_test_with_skip_bmg.py create mode 100644 test/xpu/extended/run_test_with_skip_lnl.py create mode 100644 test/xpu/extended/run_test_with_skip_mtl.py create mode 100644 test/xpu/extended/skip_list_win_bmg.py create mode 100644 test/xpu/extended/skip_list_win_lnl.py create mode 100644 test/xpu/extended/skip_list_win_mtl.py create mode 100644 test/xpu/run_test_with_skip_bmg.py create mode 100644 test/xpu/run_test_with_skip_lnl.py create mode 100644 test/xpu/skip_list_win_bmg.py create mode 100644 test/xpu/skip_list_win_lnl.py diff --git a/test/xpu/extended/run_test_with_skip_bmg.py b/test/xpu/extended/run_test_with_skip_bmg.py new file mode 100644 index 000000000..6499550f5 --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_bmg.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_bmg["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", 
skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_lnl.py b/test/xpu/extended/run_test_with_skip_lnl.py new file mode 100644 index 000000000..a795ca07a --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_lnl.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_lnl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) diff --git a/test/xpu/extended/run_test_with_skip_mtl.py b/test/xpu/extended/run_test_with_skip_mtl.py new file mode 100644 index 000000000..6ed39a64e --- /dev/null +++ b/test/xpu/extended/run_test_with_skip_mtl.py @@ -0,0 +1,22 @@ +import os +import pytest +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_mtl import skip_dict as skip_dict_win_mtl + +IS_WINDOWS = sys.platform == "win32" + +skip_list = skip_dict["test_ops_xpu.py"] +if IS_WINDOWS: + skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_mtl["test_ops_xpu.py"] + +skip_options = "not " + skip_list[0] +for skip_case in skip_list[1:]: + skip_option = " and not " + skip_case + skip_options += skip_option + +os.environ["PYTORCH_TEST_WITH_SLOW"]="1" +test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] +res = pytest.main(test_command) +sys.exit(res) \ No newline at end of file diff --git a/test/xpu/extended/skip_list_win_bmg.py 
b/test/xpu/extended/skip_list_win_bmg.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_bmg.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + "test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_lnl.py b/test/xpu/extended/skip_list_win_lnl.py new file mode 100644 index 000000000..2ee1dd31e --- /dev/null +++ b/test/xpu/extended/skip_list_win_lnl.py @@ -0,0 +1,13 @@ +skip_dict = { + "test_ops_xpu.py": ( + # https://github.com/intel/torch-xpu-ops/issues/1173 + # Fatal Python error: Illegal instruction + "test_compare_cpu_grid_sampler_2d_xpu_float64", + "test_compare_cpu_cosh_xpu_complex64", + "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", + "test_compare_cpu_nn_functional_softshrink_xpu_float16", + "test_compare_cpu_nn_functional_softshrink_xpu_float32", + "test_compare_cpu_nn_functional_softshrink_xpu_float64", + "test_compare_cpu_square_xpu_complex128", + ), +} diff --git a/test/xpu/extended/skip_list_win_mtl.py b/test/xpu/extended/skip_list_win_mtl.py new file mode 100644 index 000000000..b0d971c6e --- /dev/null +++ b/test/xpu/extended/skip_list_win_mtl.py @@ -0,0 +1,20 @@ +skip_dict = { + # failed on MTL windows, skip first for Preci + "test_ops_xpu.py": ( + "test_compare_cpu_sqrt_xpu_complex64", + "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", + + "test_compare_cpu_cosh_xpu_complex128", + "test_compare_cpu_frexp_xpu_bfloat16", + "test_compare_cpu_frexp_xpu_float16", + "test_compare_cpu_frexp_xpu_float32", + 
"test_compare_cpu_frexp_xpu_float64", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_bfloat16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float16", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float32", + "test_compare_cpu_max_pool2d_with_indices_backward_xpu_float64", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_bfloat16", + "test_compare_cpu_nn_functional_avg_pool2d_xpu_float32", + "test_compare_cpu_nn_functional_avg_pool3d_xpu_float32", + ), +} diff --git a/test/xpu/run_test_with_skip_bmg.py b/test/xpu/run_test_with_skip_bmg.py new file mode 100644 index 000000000..9bd360296 --- /dev/null +++ b/test/xpu/run_test_with_skip_bmg.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_bmg import skip_dict as skip_dict_win_bmg +from xpu_test_utils import launch_test + + +res = 0 +IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_bmg: + skip_list += skip_dict_win_bmg[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/run_test_with_skip_lnl.py b/test/xpu/run_test_with_skip_lnl.py new file mode 100644 index 000000000..4413626ea --- /dev/null +++ b/test/xpu/run_test_with_skip_lnl.py @@ -0,0 +1,24 @@ +import os +import sys +from skip_list_common import skip_dict +from skip_list_win import skip_dict as skip_dict_win +from skip_list_win_lnl import skip_dict as skip_dict_win_lnl +from xpu_test_utils import launch_test + + +res = 0 +IS_WINDOWS = sys.platform == "win32" + +for key in skip_dict: + skip_list = skip_dict[key] + if IS_WINDOWS and key in skip_dict_win: + skip_list += skip_dict_win[key] + if IS_WINDOWS and key in skip_dict_win_lnl: 
+ skip_list += skip_dict_win_lnl[key] + res += launch_test(key, skip_list) + +if os.name == "nt": + sys.exit(res) +else: + exit_code = os.WEXITSTATUS(res) + sys.exit(exit_code) \ No newline at end of file diff --git a/test/xpu/skip_list_win_bmg.py b/test/xpu/skip_list_win_bmg.py new file mode 100644 index 000000000..a91d4f4a5 --- /dev/null +++ b/test/xpu/skip_list_win_bmg.py @@ -0,0 +1,39 @@ +skip_dict = { + # tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.pyy": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj, device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + ":test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + 
"test_reference_numerics_small_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} diff --git a/test/xpu/skip_list_win_lnl.py b/test/xpu/skip_list_win_lnl.py new file mode 100644 index 000000000..a9e8bfc3f --- /dev/null +++ b/test/xpu/skip_list_win_lnl.py @@ -0,0 +1,38 @@ +skip_dict = { + # tensor(0.-0.j, device='xpu:0', dtype=torch.complex32) tensor(nan+nanj, device='xpu:0', dtype=torch.complex32) (1.5707964+0j) + "test_unary_ufuncs_xpu.py": ( + "test_reference_numerics_small_acos_xpu_complex32", + "test_reference_numerics_small_asin_xpu_complex32", + "test_reference_numerics_small_asinh_xpu_complex32", + "test_reference_numerics_small_atan_xpu_complex32", + "test_reference_numerics_small_atanh_xpu_complex32", + # Need to check compiler std::sin() on inf+infj + "test_reference_numerics_extremal__refs_sin_xpu_complex128", + "test_reference_numerics_extremal__refs_sin_xpu_complex64", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex128", + "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", + "test_reference_numerics_extremal_sin_xpu_complex128", + "test_reference_numerics_extremal_sin_xpu_complex64", + "test_reference_numerics_extremal_sinh_xpu_complex128", + "test_reference_numerics_extremal_sinh_xpu_complex64", + "test_reference_numerics_large__refs_sin_xpu_complex32", + "test_reference_numerics_large_sin_xpu_complex32", + # Known issue of exp accuracy + # tensor(13437.7000-501.j, device='xpu:0', dtype=torch.complex128) tensor(inf+infj,
device='xpu:0', dtype=torch.complex128) (-inf+infj) + "test_reference_numerics_large__refs_exp_xpu_complex128", + "test_reference_numerics_large_exp_xpu_complex128", + "test_reference_numerics_small_exp_xpu_complex32", + "test_reference_numerics_normal_special_i1_xpu_float32", + "test_reference_numerics_normal_sigmoid_xpu_complex32", + ), + # https://github.com/intel/torch-xpu-ops/issues/1171 + # AssertionError: 'Assertion maxind >= 0 && maxind < outputImageSize failed' not found in '\nAssertHandler::printMessage\n' : The expected error was not found + "nn\\test_pooling_xpu.py": ( + "test_MaxUnpool_index_errors_case1_xpu", + "test_MaxUnpool_index_errors_case2_xpu", + "test_MaxUnpool_index_errors_case4_xpu", + "test_MaxUnpool_index_errors_case6_xpu", + "test_MaxUnpool_index_errors_case7_xpu", + "test_MaxUnpool_index_errors_case9_xpu", + ), +} From 7d66fe1150223dc77fb65b994579d14f2bd96402 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Wed, 25 Dec 2024 18:24:15 -0800 Subject: [PATCH 7/7] ci/transformers: dump xpu-smi output in the log (#1206) Signed-off-by: Dmitry Rogozhkin Co-authored-by: Cheng, Penghui --- .github/workflows/_linux_transformers.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index f79227658..a539260a6 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -112,6 +112,8 @@ jobs: lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt echo "GPU render nodes:" cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt + echo "xpu-smi output:" + xpu-smi discovery -y --json --dump -1 - name: Sanitry check installed packages run: | source activate huggingface_transformers_test