diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 4fd192c06..9cfd67477 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -4,6 +4,8 @@ if [ "$1" != "nightly_wheel" ];then source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/umf/latest/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh + source /opt/intel/oneapi/ccl/latest/env/vars.sh + source /opt/intel/oneapi/mpi/latest/env/vars.sh else echo "Don't need to source DL-Essential for nightly wheel" fi diff --git a/.github/workflows/_linux_transformers.yml b/.github/workflows/_linux_transformers.yml index fd099fcb6..95aee8e7e 100644 --- a/.github/workflows/_linux_transformers.yml +++ b/.github/workflows/_linux_transformers.yml @@ -50,6 +50,7 @@ jobs: DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }} python: ${{ inputs.python != '' && inputs.python || '3.10' }} pytorch: ${{ inputs.pytorch != '' && inputs.pytorch || 'nightly' }} + transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py' steps: - name: Checkout torch-xpu-ops @@ -60,7 +61,7 @@ jobs: uses: actions/checkout@v4 with: repository: huggingface/transformers - ref: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.0' }} + ref: ${{ env.transformers }} path: transformers - name: Prepare OS environment run: | @@ -106,12 +107,54 @@ jobs: id: installed run: | source activate huggingface_transformers_test - echo "TORCH_BRANCH_ID=$(python -c 'import torch; print(torch.__version__)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "pip installed packages:" pip list | tee ${{ github.workspace }}/transformers/tests_log/pip_list.txt + echo "lspci gpu devices:" + lspci -d ::0380 | tee ${{ github.workspace }}/transformers/tests_log/lspci_0380.txt echo "GPU render nodes:" cat 
/sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt + # printing annotations for the key packages + echo "### Annotations" >> $GITHUB_STEP_SUMMARY + echo "| | |" >> $GITHUB_STEP_SUMMARY + echo "| --- | --- |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |" >> $GITHUB_STEP_SUMMARY + packages=" \ + level-zero \ + libigc1 \ + libigc2 \ + libze1 \ + libze-intel-gpu1 \ + intel-i915-dkms \ + intel-level-zero-gpu \ + intel-opencl-icd" + for package in $packages; do + package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed "s/ */ /g" | cut -f3 -d" ") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY + done + packages="accelerate \ + numpy \ + torch \ + torchaudio \ + torchvision \ + transformers" + for package in $packages; do + package_version=$(python -c "import $package; print($package.__version__)") + echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |" >> $GITHUB_STEP_SUMMARY + done + # printing annotations for GPU cards + var="[$(cat /sys/class/drm/render*/device/vendor)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY + var="[$(cat /sys/class/drm/render*/device/device)]" + echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed "s/ /,/g") |" >> $GITHUB_STEP_SUMMARY + var=$(python -c "import torch; print(torch.version.xpu)") + echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |" >> $GITHUB_STEP_SUMMARY + var=$(python -c "import torch; print(torch.xpu.device_count())") + echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |" >> $GITHUB_STEP_SUMMARY + # printing annotations with key environment variables + 
echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |" >> $GITHUB_STEP_SUMMARY + echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |" >> $GITHUB_STEP_SUMMARY - name: Sanitry check installed packages run: | source activate huggingface_transformers_test @@ -120,6 +163,7 @@ jobs: pip show torch | grep Version | grep xpu pip show torchaudio | grep Version | grep xpu pip show torchvision | grep Version | grep xpu + python -c 'import torch; exit(not torch.xpu.is_available())' - name: Run XPU backbone run: | source activate huggingface_transformers_test diff --git a/src/ATen/native/xpu/sycl/ResizeKernel.cpp b/src/ATen/native/xpu/sycl/ResizeKernel.cpp index 237a1c213..f1ee7f944 100644 --- a/src/ATen/native/xpu/sycl/ResizeKernel.cpp +++ b/src/ATen/native/xpu/sycl/ResizeKernel.cpp @@ -25,8 +25,9 @@ void resize_bytes_xpu(StorageImpl* storage, size_t size_bytes) { c10::xpu::XPUGuard guard(device.index()); at::DataPtr data = allocator->allocate(size_bytes); if (storage->data_ptr()) { - auto q = at::xpu::getCurrentSYCLQueue(); + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); + auto q = at::xpu::getCurrentSYCLQueue(); q.memcpy( data.get(), storage->data(), std::min(storage->nbytes(), size_bytes)); } diff --git a/src/ATen/xpu/EmptyTensor.cpp b/src/ATen/xpu/EmptyTensor.cpp index 3f5e998f8..6411bb221 100644 --- a/src/ATen/xpu/EmptyTensor.cpp +++ b/src/ATen/xpu/EmptyTensor.cpp @@ -54,6 +54,7 @@ TensorBase empty_strided_xpu( IntArrayRef stride, ScalarType dtype, c10::optional device_opt) { + at::globalContext().lazyInitDevice(c10::DeviceType::XPU); const auto device = device_or_default(device_opt); TORCH_INTERNAL_ASSERT(device.is_xpu()); const c10::DeviceGuard device_guard(device); diff --git a/test/xpu/extended/skip_list_arc.py b/test/xpu/extended/skip_list_arc.py index e1e701b84..c8e26ccf3 100644 --- a/test/xpu/extended/skip_list_arc.py +++ b/test/xpu/extended/skip_list_arc.py @@ -7,5 +7,21 @@ "test_compare_cpu_bincount_xpu_int64", 
"test_compare_cpu_bincount_xpu_int8", "test_compare_cpu_bincount_xpu_uint8", + # RuntimeError: Kernel is incompatible with all devices in devs + # https://github.com/intel/torch-xpu-ops/issues/1150 + "test_compare_cpu_logcumsumexp_xpu_float16", + "test_compare_cpu_logcumsumexp_xpu_float32", + "test_compare_cpu_nn_functional_pdist_xpu_float32", + "test_compare_cpu_tril_indices_xpu_int32", + "test_compare_cpu_tril_indices_xpu_int64", + "test_compare_cpu_triu_indices_xpu_int32", + "test_compare_cpu_triu_indices_xpu_int64", + "test_backward_logcumsumexp_xpu_float32", + "test_backward_nn_functional_pdist_xpu_float32", + "test_forward_ad_logcumsumexp_xpu_float32", + "test_operator_logcumsumexp_xpu_float32", + "test_operator_nn_functional_pdist_xpu_float32", + "test_view_replay_logcumsumexp_xpu_float32", + "test_view_replay_nn_functional_pdist_xpu_float32", ), } diff --git a/test/xpu/extended/skip_list_common.py b/test/xpu/extended/skip_list_common.py index 6b5fd653e..643d631eb 100644 --- a/test/xpu/extended/skip_list_common.py +++ b/test/xpu/extended/skip_list_common.py @@ -194,5 +194,9 @@ # Greatest absolute difference: 0.0625 at index (1,) (up to 0.001 allowed) # Greatest relative difference: 0.00640869140625 at index (1,) (up to 0.001 allowed) "test_compare_cpu_xlogy_xpu_bfloat16", + "test_compare_cpu_div_trunc_rounding_xpu_float64", + "test_compare_cpu_div_trunc_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_float16", + "test_compare_cpu_div_floor_rounding_xpu_bfloat16", ), } diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py index 670a88f53..52a93d91b 100644 --- a/test/xpu/skip_list_common.py +++ b/test/xpu/skip_list_common.py @@ -649,6 +649,14 @@ "test_python_ref__refs_square_xpu_complex64", "test_python_ref_torch_fallback__refs_square_xpu_complex64", "test_python_ref_torch_fallback__refs_exp_xpu_complex128", + + # Failed on rolling driver, passed on preci + "test_python_ref__refs_div_trunc_rounding_xpu_float64", + 
"test_python_ref_executor__refs_div_trunc_rounding_executor_aten_xpu_float64", + "test_python_ref_torch_fallback__refs_div_trunc_rounding_xpu_float64", + + # TODO: passed from source code building version, investigate + "test_python_ref__refs_log2_xpu_complex128", ), "test_binary_ufuncs_xpu.py": ( @@ -1136,6 +1144,7 @@ # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) "test_reference_numerics_normal__refs_asinh_xpu_complex64", "test_reference_numerics_normal_asinh_xpu_complex64", + "test_batch_vs_slicing__refs_sigmoid_xpu_complex128", # Unexpected success: CUDA uses thrust::sqrt and has accuracy issue. XPU use std::sqrt and has no issue. "test_reference_numerics_large_rsqrt_xpu_complex32", # Numeric difference @@ -1514,6 +1523,8 @@ # XPU does not support tunable. "test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", + "test_dump_results_on_exit_tunableop_xpu_float32", + "test_rotating_buffer_tunableop_xpu_float32", # CUDA bias cases added in latest PyTorch # AttributeError: module 'torch._C' has no attribute '_cuda_tunableop_enable' "test_matmul_check_entries_tunableop_xpu_float16", @@ -3230,7 +3241,10 @@ "test_type_promotion_xpu.py": None, - "test_distributions_xpu.py": None, + "test_distributions_xpu.py": ( + # TODO: Passed on lts driver version, but failed on rolling driver version + "test_gamma_gpu_sample_xpu", + ), "test_optim_xpu.py": ( # oneDNN issues diff --git a/test/xpu/test_unary_ufuncs_xpu.py b/test/xpu/test_unary_ufuncs_xpu.py index 0e05a8e7c..a6c12a2ad 100644 --- a/test/xpu/test_unary_ufuncs_xpu.py +++ b/test/xpu/test_unary_ufuncs_xpu.py @@ -1,6 +1,7 @@ # Owner(s): ["module: intel"] -from torch.testing._internal.common_device_type import instantiate_device_type_tests +import torch +from torch.testing._internal.common_device_type import instantiate_device_type_tests, onlyXPU from torch.testing._internal.common_utils import run_tests try: @@ -11,6 +12,38 @@ with 
XPUPatchForImport(False):
     from test_unary_ufuncs import TestUnaryUfuncs
 
+    @onlyXPU
+    def _nonzero_static_large(self, device):
+        """Check torch.nonzero_static on a large input: indices, no OOB writes, 2-D padding."""
+        # size was picked for multiple iters per SM on a 132-SM H100; TODO confirm it suits XPU
+        size_inp = 1024 * 16 * 132 + 1024 * 16
+        x = torch.zeros(size_inp, device=device)
+        # mark half of the positions, chosen at random without repeats, as nonzero
+        indices = torch.randperm(size_inp, device=device)[: size_inp // 2]
+        sorted_indices, _ = torch.sort(indices)
+        x[sorted_indices] = 1
+        res = torch.nonzero_static(x, size=size_inp // 2).view(-1)
+        self.assertEqual(res, sorted_indices)
+        # no oob writes: entries of `out` past the requested size must keep the fill value 10
+        out = torch.full((size_inp,), 10, device=device, dtype=torch.int64)
+        torch.nonzero_static(x, size=size_inp // 4, out=out[: size_inp // 2])
+        self.assertEqual(out[: size_inp // 4], sorted_indices[: size_inp // 4])
+        self.assertEqual(
+            out[size_inp // 4 :],
+            torch.tensor(10, device=device).expand_as(out[size_inp // 4 :]),
+        )
+        # correct fill for 2d: rows past the real nonzero count must be padded with -1
+        x = x.view(2, size_inp // 2)
+        ref = x.nonzero()
+        res = x.nonzero_static(size=size_inp // 2 + 2)
+        self.assertEqual(res.shape, [size_inp // 2 + 2, 2])
+        self.assertEqual(ref, res[: size_inp // 2])
+        self.assertEqual(
+            res[size_inp // 2 :],
+            torch.tensor(-1, device=device).expand_as(res[size_inp // 2 :]),
+        )
+    TestUnaryUfuncs.test_nonzero_static_large = _nonzero_static_large
+
 instantiate_device_type_tests(TestUnaryUfuncs, globals(),only_for=("xpu"), allow_xpu=True)
 
 if __name__ == "__main__":