From 201a29bd8fefe8864784fe360594956db8cee1c3 Mon Sep 17 00:00:00 2001
From: majing
Date: Wed, 24 Jul 2024 08:18:22 +0800
Subject: [PATCH 1/8] Update pin_memory implementation to follow stock PyTorch
 change (#639)

PyTorch implemented device-agnostic aten::is_pinned and aten::_pin_memory in
the latest main:
https://github.com/pytorch/pytorch/commit/8963623494d096e3e5d5038aaedeb0a4fa2b3b57

Signed-off-by: majing
---
 src/ATen/native/xpu/PinnedMemoryAllocator.cpp | 39 -------------------
 test/xpu/test_torch_xpu.py                    | 30 ++++----------
 yaml/xpu_functions.yaml                       |  2 -
 3 files changed, 8 insertions(+), 63 deletions(-)
 delete mode 100644 src/ATen/native/xpu/PinnedMemoryAllocator.cpp

diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
deleted file mode 100644
index 8d04d93e1..000000000
--- a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include
-#include
-#include
-#include
-
-namespace at {
-
-// Note: The user must call is_pinned(device='xpu') to explicitly call here.
-bool XPUNativeFunctions::is_pinned(
-    const Tensor& self,
-    c10::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-      !device.has_value() || device->type() == c10::DeviceType::XPU);
-
-  return at::detail::getXPUHooks().isPinnedPtr(self.storage().data());
-}
-
-// Note: The user must call tensor.pin_memory(device='xpu') to explicitly call
-// here.
-Tensor XPUNativeFunctions::_pin_memory(
-    const Tensor& self,
-    c10::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-      !device.has_value() || device->type() == c10::DeviceType::XPU);
-
-  auto* allocator = at::xpu::getPinnedMemoryAllocator();
-  auto storage = c10::Storage(
-      c10::Storage::use_byte_size_t(),
-      at::detail::computeStorageNbytes(
-          self.sizes(), self.strides(), self.dtype().itemsize()),
-      allocator,
-      /*resizable=*/false);
-  auto tensor = at::cpu::empty({0}, self.options())
-                    .set_(storage, 0, self.sizes(), self.strides());
-  tensor.copy_(self);
-  return tensor;
-}
-
-} // namespace at

diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py
index afb4b28a8..b82a8ec67 100644
--- a/test/xpu/test_torch_xpu.py
+++ b/test/xpu/test_torch_xpu.py
@@ -8572,31 +8572,17 @@ def test_new(self) -> None:
         # TypeError would be better
         self.assertRaises(RuntimeError, lambda: x.new(z.storage()))
 
-    @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property")
     def test_pin_memory(self):
         x = torch.randn(3, 5)
         self.assertFalse(x.is_pinned())
-        if not torch.cuda.is_available() or not torch.xpu.is_available():
-            self.assertRaises(RuntimeError, lambda: x.pin_memory())
-        else:
-            if torch.xpu.is_available():
-                device = 'xpu'
-                self.assertFalse(x.is_pinned(device))
-                pinned = x.pin_memory(device)
-                self.assertTrue(pinned.is_pinned(device))
-                self.assertEqual(pinned, x)
-                self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
-                # test that pin_memory on already pinned tensor has no effect
-                self.assertIs(pinned, pinned.pin_memory(device))
-                self.assertEqual(pinned.data_ptr(), pinned.pin_memory(device).data_ptr())
-            else:
-                pinned = x.pin_memory()
-                self.assertTrue(pinned.is_pinned())
-                self.assertEqual(pinned, x)
-                self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
-                # test that pin_memory on already pinned tensor has no effect
-                self.assertIs(pinned, pinned.pin_memory())
-                self.assertEqual(pinned.data_ptr(), pinned.pin_memory().data_ptr())
+        if torch.cuda.is_available() or torch.xpu.is_available():
+            pinned = x.pin_memory()
+            self.assertTrue(pinned.is_pinned())
+            self.assertEqual(pinned, x)
+            self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
+            # test that pin_memory on already pinned tensor has no effect
+            self.assertIs(pinned, pinned.pin_memory())
+            self.assertEqual(pinned.data_ptr(), pinned.pin_memory().data_ptr())

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 54c5c31ff..cd6f80b84 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -539,8 +539,6 @@ supported:
   - sgn.out
   - sgn_
   - _cdist_forward
-  - _pin_memory
-  - is_pinned
   - is_set_to
   - bucketize.Tensor
   - bucketize.Tensor_out

From fbd095a57bb146c1360e2dfe5c8cd7ef615567ae Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Tue, 23 Jul 2024 20:04:00 -0700
Subject: [PATCH 2/8] Drop CPU fallback for aten::norm.out and
 aten::nextafter.out (#641)

Fixes: #640
Fixes: 6eca394 ("Add aten::nextafter and its variants")

CC: @fengyuan14 @yucai-intel @xytintel

Signed-off-by: Dmitry Rogozhkin
---
 src/ATen/native/xpu/XPUFallback.template | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index d5ae19d5a..52b417415 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -239,8 +239,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "nanmedian",
       "nanmedian.dim_values",
       "nansum",
-      "norm.out",
-      "nextafter.out",
       "ormqr",
       "_pdist_backward",
       "_pdist_forward",

From 07ebadbd582305ca4dac2759358e8be651979df4 Mon Sep 17 00:00:00 2001
From: Yutao Xu
Date: Wed, 24 Jul 2024 11:05:17 +0800
Subject: [PATCH 3/8] Update the comments of the radix sort kernel (#633)

Update the comments of the radix sort kernel.

Co-authored-by: Feng Yuan
---
 src/ATen/native/xpu/sycl/SortingKernels.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h
index fb29a9f53..b39595143 100644
--- a/src/ATen/native/xpu/sycl/SortingKernels.h
+++ b/src/ATen/native/xpu/sycl/SortingKernels.h
@@ -539,9 +539,7 @@ void segmented_sort_pairs_(
     int num_elements) {
   constexpr int scaling_coef = sizeof(key_t) * sizeof(value_t) >= 64
       ? 2
-      : 1; // Attempt to reduce register pressure. The result will be incorrect
-           // when using too many local variables (registers).
-           // https://github.com/intel/torch-xpu-ops/issues/626
+      : 1; // Attempt to reduce register pressure for performance.
   if (num_elements > 4096 / scaling_coef) {
     // Considering register pressure, we use a problem size of 4096 to delineate
     // the boundary between single tile sort and group sort.
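A usage-level note before moving to patch 4: the device-agnostic path that
patch 1 switches to behaves as follows from the Python side. This is a minimal
sketch, assuming a PyTorch build that exposes `torch.xpu` and has torch-xpu-ops
loaded:

```python
import torch

x = torch.randn(3, 5)
if torch.cuda.is_available() or torch.xpu.is_available():
    # No device= argument is needed anymore: aten::_pin_memory now dispatches
    # to whichever accelerator backend is available.
    pinned = x.pin_memory()
    assert pinned.is_pinned()                 # device-agnostic query
    assert pinned.data_ptr() != x.data_ptr()  # pinning copies into page-locked memory
    assert pinned.pin_memory() is pinned      # re-pinning a pinned tensor is a no-op
```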
From f716a582852f562e2ad26a0381885b0509168fe9 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:54:54 +0800 Subject: [PATCH 4/8] Add aten::repeat_interleave (#564) ![image](https://github.com/intel/torch-xpu-ops/assets/105194461/1fe5b8bc-c82a-41eb-b673-7c5cb6f5d3cf) --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 1 - src/ATen/native/xpu/Repeat.cpp | 10 +++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 79 +++++++++++++++++++++++ src/ATen/native/xpu/sycl/RepeatKernel.h | 9 +++ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 7 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/Repeat.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.h diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 99069eefb..826b1e27b 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -47,7 +47,6 @@ static void cum_ops_meta( maybe_wrap_dim(dim, self.dim()); ScalarType out_dtype; - if (result.defined()) { out_dtype = dtype.value_or(result.scalar_type()); at::xpu::resize_out( diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp new file mode 100644 index 000000000..38e5ae8da --- /dev/null +++ b/src/ATen/native/xpu/Repeat.cpp @@ -0,0 +1,10 @@ +#include +#include +#include +namespace at { +Tensor XPUNativeFunctions::repeat_interleave( + const Tensor& repeats, + c10::optional output_size) { + return at::native::xpu::repeat_interleave_kernel(repeats, output_size); +} +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 52b417415..356a45f70 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -248,7 +248,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "repeat_interleave.Tensor", "replication_pad1d_backward.grad_input", "replication_pad1d.out", "replication_pad2d_backward", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp new file mode 100644 index 000000000..c3e27ab62 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +namespace at::native::xpu { +template +struct RepeatInterleaveKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + auto rep_ptr = rep_data_; + auto cum_ptr = cum_data_; + auto res_ptr = res_data_; + + for (int64_t i = item.get_global_id(0); i < size_; + i += item.get_global_range()[0]) { + int64_t end = cum_ptr[i]; + int64_t repeat = rep_ptr[i]; + int64_t start = end - repeat; + for (int64_t j = start; j < end; j++) { + res_ptr[j] = i; + } + } + } + RepeatInterleaveKernelFunctor( + const index_t* rep_data, + const int64_t* cum_data, + index_t* res_data, + int64_t size, + int64_t result_size) + : rep_data_(rep_data), + cum_data_(cum_data), + res_data_(res_data), + size_(size), + result_size_(result_size) {} + + private: + const index_t* rep_data_; + const int64_t* cum_data_; + index_t* res_data_; + int64_t size_; + int64_t result_size_; +}; + +template +static void compute_xpu( + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, + index_t* result_ptr, + int64_t size, + int64_t result_size) { + if (size == 0) + return; + + auto kfn = RepeatInterleaveKernelFunctor( + 
repeat_ptr, + cumsum_ptr, + result_ptr, + size, + result_size); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t local_range = size < wg_size ? size : wg_size; + int64_t global_range = ((size + local_range - 1) / local_range) * local_range; + + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); +} + +Tensor repeat_interleave_kernel( + const Tensor& repeat, + c10::optional output_size) { + Tensor output; + + AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_xpu", [&] { + output = repeat_interleave_common>( + repeat, output_size); + }); + return output; +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h new file mode 100644 index 000000000..3df41a956 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -0,0 +1,9 @@ +#pragma once +#include +namespace at::native::xpu { + +Tensor repeat_interleave_kernel( + const Tensor& repeats, + c10::optional output_size); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 3e3b5d4d1..96e43058a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "repeat_interleave", "fmax", "fmin", "floor", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index cd6f80b84..48ddc4c3f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,7 @@ supported: - ceil - ceil_ - ceil.out + - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out - norm.ScalarOpt_dim From d25d3d09a2a01867bf613f9cc535c596832eb2e8 Mon Sep 17 00:00:00 2001 From: yuchengliu1 Date: Wed, 24 Jul 2024 14:42:30 +0800 Subject: [PATCH 5/8] pow_Tensor_Scalar: fixing error result when scalar exp is a bool. (#638) Fixing TestBinaryUfuncsXPU.test_pow_xpu_int --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Pow.cpp | 8 +++++++- test/xpu/run_test_with_skip.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index df335bfa1..97dc5a0c2 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -58,7 +58,13 @@ Tensor& XPUNativeFunctions::pow_out( Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { Tensor out; auto iter = pow_tensor_scalar_meta(self, exponent, out); - native::xpu::pow_tensor_scalar_kernel(iter, exponent); + if (exponent.equal(0.0) || exponent.equal(false)) { + iter.output().fill_(1); + } else if (exponent.equal(1.0) || exponent.equal(true)) { + iter.output().copy_(self); + } else { + native::xpu::pow_tensor_scalar_kernel(iter, exponent); + } return iter.output(); } diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 0e00da513..3ffc3feb6 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -797,12 +797,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_list = ( "test_fmod_remainder_by_zero_integral_xpu_int64", # zero division is an undefined behavior: different handles on different backends "test_div_rounding_numpy_xpu_float16", # Calculation error. XPU implementation uses opmath type. - # RuntimeError: false INTERNAL ASSERT FAILED at "torch-xpu-ops/src/ATen/native/xpu/sycl/PowKernels.cpp":233, please report a bug to PyTorch. 
invalid combination of type in Pow function, common dtype: Short, exp is integral? 0
+    # fail in complex_exponents=[-1.0 - 1.5j, 3.3j]
+    # Mismatched elements: 33 / 100 (33.0%)
+    # Greatest absolute difference: 0.00038337233127094805 at index (4,) (up to 1e-05 allowed)
+    # Greatest relative difference: 1.9085073290625587e-06 at index (6,) (up to 1.3e-06 allowed)
     "test_pow_xpu_int16",
     "test_pow_xpu_int32",
     "test_pow_xpu_int64",
-    "test_pow_xpu_int8",
-    "test_pow_xpu_uint8",
     # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available.
     "_jiterator_",
     # Unexpected success

From e1e195bd4d6f9541fc05c54579242f07f30a63b3 Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:55:52 +0800
Subject: [PATCH 6/8] Support XPU ABI=0 build (#531)

# Motivation
Support the XPU ABI-neutral build. Starting from compiler 2025.0, `libsycl.so`
is the ABI-neutral library; before that, `libsycl-preview.so` was. Refer to
https://github.com/pytorch/pytorch/pull/130110.
---
 cmake/BuildFlags.cmake                  |  6 +-
 cmake/Modules/FindSYCL.cmake            |  5 +-
 cmake/Modules/FindSYCLToolkit.cmake     | 73 ++++++-------------
 cmake/SYCL.cmake                        | 18 ------
 src/ATen/CMakeLists.txt                 |  2 +-
 src/ATen/native/xpu/sycl/AmpKernels.cpp |  4 +-
 src/CMakeLists.txt                      |  2 +-
 test/sycl/CMakeLists.txt                |  2 +-
 8 files changed, 28 insertions(+), 84 deletions(-)

diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index d0eb3e299..2f05d1ab0 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -59,10 +59,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz)
+    # Equivalent to build option -fpreview-breaking-changes for SYCL compiler.
+    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
+    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
   endif()
-  # TODO: Align with PyTorch and switch to ABI=0 eventually, after
-  # resolving incompatible implementation in SYCL runtime.
- set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) set(SYCL_FLAGS ${SYCL_FLAGS} ${SYCL_KERNEL_OPTIONS}) set(TORCH_XPU_OPS_FLAGS ${SYCL_HOST_FLAGS}) diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake index b8fc8ba22..4e266a26f 100644 --- a/cmake/Modules/FindSYCL.cmake +++ b/cmake/Modules/FindSYCL.cmake @@ -407,7 +407,6 @@ macro(SYCL_LINK_DEVICE_OBJECTS output_file sycl_target) OUTPUT ${output_file} DEPENDS ${object_files} COMMAND ${SYCL_EXECUTABLE} - -fsycl ${SYCL_device_link_flags} -fsycl-link ${object_files} -Xs "\"${SYCL_OFFLINE_COMPILER_FLAGS}\"" @@ -471,7 +470,7 @@ macro(SYCL_ADD_LIBRARY sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES @@ -530,7 +529,7 @@ macro(SYCL_ADD_EXECUTABLE sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake index e478cb4c5..46e34c7f8 100644 --- a/cmake/Modules/FindSYCLToolkit.cmake +++ b/cmake/Modules/FindSYCLToolkit.cmake @@ -25,16 +25,20 @@ This will define the following variables: #]=======================================================================] -set(SYCLTOOLKIT_FOUND False) -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +include(${TORCH_ROOT}/cmake/Modules/FindSYCLToolkit.cmake) -set(SYCL_ROOT "") -if(DEFINED ENV{SYCL_ROOT}) - set(SYCL_ROOT $ENV{SYCL_ROOT}) -elseif(DEFINED ENV{CMPLR_ROOT}) - set(SYCL_ROOT $ENV{CMPLR_ROOT}) +if(NOT SYCL_FOUND) + set(SYCLTOOLKIT_FOUND FALSE) + return() endif() +if(SYCLTOOLKIT_FOUND) + return() +endif() +set(SYCLTOOLKIT_FOUND TRUE) + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) + if(WIN32) set(SYCL_EXECUTABLE_NAME icx) else() @@ -71,43 +75,6 @@ if(nocmplr) set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() -find_file( - SYCL_INCLUDE_DIR - NAMES include - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_file( - SYCL_INCLUDE_SYCL_DIR - NAMES sycl - HINTS ${SYCL_ROOT}/include - NO_DEFAULT_PATH - ) - -list(APPEND SYCL_INCLUDE_DIR ${SYCL_INCLUDE_SYCL_DIR}) - -find_file( - SYCL_LIBRARY_DIR - NAMES lib lib64 - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_library( - SYCL_LIBRARY - NAMES sycl - HINTS ${SYCL_LIBRARY_DIR} - NO_DEFAULT_PATH -) - -if((NOT SYCL_INCLUDE_DIR) OR (NOT SYCL_LIBRARY_DIR) OR (NOT SYCL_LIBRARY)) - set(SYCLTOOLKIT_FOUND False) - set(SYCL_REASON_FAILURE "SYCL sdk is incomplete!!") - set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") - return() -endif() - # Function to write a test case to verify SYCL features. function(SYCL_CMPLR_TEST_WRITE src) @@ -202,6 +169,13 @@ set(SYCL_FLAGS "") set(SYCL_LINK_FLAGS "") list(APPEND SYCL_FLAGS "-fsycl") list(APPEND SYCL_LINK_FLAGS "-fsycl") +if(LINUX) + string(REGEX MATCH "libsycl-preview.so" is_abi_neutral ${SYCL_LIBRARY}) + if(is_abi_neutral) + list(APPEND SYCL_FLAGS "-fpreview-breaking-changes") + list(APPEND SYCL_LINK_FLAGS "-fpreview-breaking-changes") + endif() +endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -249,14 +223,3 @@ endif() message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") - -# Avoid module variables conflict due to calling find_package recursively -# e.g. 
find_package -> add_subdirectory(entering in a sub-project) -> find_package -# find_package_handle_standard_args( -# SYCLToolkit -# FOUND_VAR SYCLTOOLKIT_FOUND -# REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_LIBRARY SYCL_FLAGS -# VERSION_VAR SYCL_LANGUAGE_VERSION -# REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") -set(SYCLTOOLKIT_FOUND True) - diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index b2bbadaa4..f0724ef55 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -25,24 +25,6 @@ if(NOT SYCL_VERSION) return() endif() -find_library(SYCL_LIBRARIES sycl HINTS ${SYCL_LIBRARY_DIR}) -# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix, -# where the current version of the SYCL runtime is 7. -# Until oneAPI adds support to sycl.lib without the version suffix, -# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts. -# TODO: remove this when sycl.lib is supported on Windows -if(WIN32) - set(sycl_runtime_version 7) - find_library( - SYCL_LIBRARIES - NAMES "sycl${sycl_runtime_version}" - HINTS ${SYCL_LIBRARY_DIR} - ) - if(SYCL_LIBRARIES STREQUAL "SYCL_LIBRARIES-NOTFOUND") - message(FATAL_ERROR "Cannot find a SYCL library on Windows") - endif() -endif() - set(SYCL_COMPILER_VERSION) file(READ ${SYCL_VERSION} version_contents) string(REGEX MATCHALL "__SYCL_COMPILER_VERSION +[0-9]+" VERSION_LINE "${version_contents}") diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index c95e860fa..815ad018f 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,6 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp", "native/xpu/*.cpp", "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 5e8d258a6..87354eea9 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -102,9 +102,9 @@ void amp_foreach_non_finite_check_and_unscale_kernel( } struct AmpUpdateScaleKernelFunctor { - void operator()(sycl::item<1> item) const { + void operator()(sycl::nd_item<1> item) const { // There is only single item/task scheduled. - if (item.get_linear_id() != 0) + if (item.get_global_linear_id() != 0) return; if (*found_inf_) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5e0a8549..fa17491a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARIES}) + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) diff --git a/test/sycl/CMakeLists.txt b/test/sycl/CMakeLists.txt index 0671fbc94..5bbee8643 100644 --- a/test/sycl/CMakeLists.txt +++ b/test/sycl/CMakeLists.txt @@ -42,7 +42,7 @@ add_dependencies(test_sycl_build_archive sycl_simple_kernel_test) # Instead, we use explicit linkage option '--whole-archive', which is required # by linkage of device object modules archived in the static library. Then # explicit linkage configuration of SYCL runtime library is required. 
-target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARIES}) +target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARY}) if(INSTALL_TEST) install(TARGETS test_sycl_build_archive DESTINATION bin) From fcaa34cb5c740c81fab2126b0a68e5f4c724324a Mon Sep 17 00:00:00 2001 From: majing Date: Thu, 25 Jul 2024 08:45:49 +0800 Subject: [PATCH 7/8] Add aten::histogram and variant (#569) CPU/MPS only ops --------- Signed-off-by: majing Signed-off-by: Ma, Jing1 Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Histogram.cpp | 195 +++++++++++++++ src/ATen/native/xpu/sycl/HistogramKernels.h | 23 ++ .../native/xpu/sycl/HistogramddKernels.cpp | 227 ++++++++++++++++++ test/xpu/extended/run_test_with_skip.py | 5 + test/xpu/extended/test_ops_xpu.py | 6 + test/xpu/run_test_with_skip.py | 4 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 4 + 8 files changed, 465 insertions(+) create mode 100644 src/ATen/native/xpu/Histogram.cpp create mode 100644 src/ATen/native/xpu/sycl/HistogramKernels.h create mode 100644 src/ATen/native/xpu/sycl/HistogramddKernels.cpp diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp new file mode 100644 index 000000000..de49952b2 --- /dev/null +++ b/src/ATen/native/xpu/Histogram.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +namespace at { + +/* Checks properties of input tensors input, bins, and weight. + */ +void histogramdd_check_inputs( + const Tensor& input, + const Tensor& bins, + const std::optional& weight) { + if (weight.has_value()) { + TORCH_CHECK( + weight->device() == input.device(), + "weight and input need to be on the same device.") + } + auto input_dtype = input.dtype(); + auto bins_dtype = bins.dtype(); + TORCH_CHECK( + input_dtype == bins_dtype, + "torch.histogramdd: input tensor and bins tensors should", + " have the same dtype, but got input with dtype ", + input_dtype, + " and bins with dtype ", + bins_dtype); + + const int64_t bins_dim = bins.dim(); + TORCH_CHECK( + bins_dim == 1, + "torch.histogramdd: bins tensor should have one dimension,", + " but got ", + bins_dim, + " dimensions in the bin tensor"); + + const int64_t numel = bins.numel(); + TORCH_CHECK( + numel > 0, + "torch.histogramdd: bins tensor should have at least 1 element,", + " but got ", + numel, + " elements in the bin tensor"); + + if (weight.has_value()) { + TORCH_CHECK( + input.dtype() == weight.value().dtype(), + "torch.histogramdd: if weight tensor is provided, ", + "input tensor and weight tensor should have the same dtype, ", + "but got input(", + input.dtype(), + ")", + ", and weight(", + weight.value().dtype(), + ")"); + + auto input_sizes = input.sizes().vec(); + + auto weight_sizes = weight.value().sizes().vec(); + if (weight_sizes.empty()) { + // correctly handle scalars + weight_sizes = {1}; + } + + TORCH_CHECK( + input_sizes == weight_sizes, + "torch.histogramdd: if weight tensor is provided it should have", + " the same shape as the input tensor excluding its innermost ", + "dimension, but got input with shape ", + input.sizes(), + " and weight ", + "with shape ", + weight.value().sizes()); + } +} + +/* Checks properties of output tensors hist and bin_edges, then resizes them. 
+ */ +void histogramdd_prepare_out( + const Tensor& input, + int64_t bin_ct, + const Tensor& hist, + const Tensor& bin_edges) { + TORCH_CHECK( + input.dtype() == hist.dtype(), + "torch.histogram: input tensor and hist tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and hist ", + hist.dtype()); + + TORCH_CHECK( + input.dtype() == bin_edges.dtype(), + "torch.histogram: input tensor and bin_edges tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and bin_edges ", + bin_edges.dtype()); + + TORCH_CHECK( + bin_ct > 0, "torch.histogram(): bins must be > 0, but got ", bin_ct); + + at::native::resize_output(bin_edges, {bin_ct + 1}); + + at::native::resize_output(hist, {bin_ct}); +} + +void histogramdd_prepare_out( + const Tensor& input, + const Tensor& bins, + const Tensor& hist, + const Tensor& bin_edges) { + int64_t bin_ct = bins.numel() - 1; + histogramdd_prepare_out(input, bin_ct, hist, bin_edges); +} + +static Tensor& histogramdd_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + histogramdd_check_inputs(self, bins, weight); + histogramdd_prepare_out(self, bins, hist, bin_edges); + + bin_edges.copy_(bins); + + at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); + return hist; +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_out( + reshaped_self, bins, reshaped_weight, density, hist, bin_edges); + + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); + return histogram_out(self, bins, weight, density, hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? 
weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_prepare_out(reshaped_self, bin_ct, hist, bin_edges); + histogramdd_check_inputs(reshaped_self, bin_edges, reshaped_weight); + + at::native::xpu::histogramdd_linear_kernel( + reshaped_self, bin_ct, range, reshaped_weight, density, hist, bin_edges); + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges_out = at::empty({0}, self.options()); + return histogram_out( + self, bin_ct, range, weight, density, hist, bin_edges_out); +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h new file mode 100644 index 000000000..da153186a --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges); + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp new file mode 100644 index 000000000..be888e4b4 --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -0,0 +1,227 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::native::xpu { + +template +struct HistogramddKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_ * bin_size_) { + int64_t ele_idx = wi_id / bin_size_; + int64_t bin_idx = wi_id % bin_size_; + + // [left, right) + if (input_[ele_idx] >= bin_edges_[bin_idx] && + input_[ele_idx] < bin_edges_[bin_idx + 1]) { + scalar_t value = weight_ ? weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_idx), value); + return; + } + + // For last bin, [left, right] + if (bin_idx == 0 && input_[ele_idx] == bin_edges_[bin_size_]) { + scalar_t value = weight_ ? 
weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_size_ - 1), value); + } + } + } + + HistogramddKernelFunctor( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) + : input_(input), + bin_edges_(bin_edges), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size) {} + + private: + const scalar_t* input_; + const scalar_t* bin_edges_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; +}; + +// For one dimension case +template +void histogramdd_template( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) { + HistogramddKernelFunctor kfn( + input, bin_edges, hist, weight, input_size, bin_size); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = + (input_size * bin_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +template +struct HistogramddLinearKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_) { + scalar_t i_value = input_[wi_id]; + if (i_value >= leftmost_edge_ && i_value <= rightmost_edge_) { + int64_t bin = + (int64_t)(((i_value - leftmost_edge_)) * bin_size_ / (rightmost_edge_ - leftmost_edge_)); + if (bin == bin_size_) + bin -= 1; + scalar_t value = weight_ ? weight_[wi_id] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin), value); + } + } + } + + HistogramddLinearKernelFunctor( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) + : input_(input), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size), + leftmost_edge_(leftmost_edge), + rightmost_edge_(rightmost_edge) {} + + private: + const scalar_t* input_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; + double leftmost_edge_; + double rightmost_edge_; +}; + +// For one dimension case +template +void histogramdd_linear_template( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) { + HistogramddLinearKernelFunctor kfn( + input, hist, weight, input_size, bin_size, leftmost_edge, rightmost_edge); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (input_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges_) { + globalContext().alertNotDeterministic("histogramdd_kernel_xpu"); + hist.fill_(0); + Tensor bin_edges = bin_edges_.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_xpu", [&]() { + histogramdd_template( + self.data_ptr(), + bin_edges.data_ptr(), + hist.data_ptr(), + weight.has_value() ? 
weight->data_ptr() : nullptr, + self.numel(), + bin_edges.numel() - 1); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges) { + globalContext().alertNotDeterministic("histogramdd_linear_kernel_xpu"); + hist.fill_(0); + + // default range for empty input + double leftmost_edge = 0., rightmost_edge = 1.; + if (range.has_value()) { + leftmost_edge = range.value()[0]; + rightmost_edge = range.value()[1]; + } else if (self.numel() > 0) { + auto extrema = at::aminmax(self); + leftmost_edge = std::get<0>(extrema).item(); + rightmost_edge = std::get<1>(extrema).item(); + } + + if (leftmost_edge == rightmost_edge) { + leftmost_edge -= 0.5; + rightmost_edge += 0.5; + } + + at::linspace_out(out_bin_edges, leftmost_edge, rightmost_edge, bin_ct + 1); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_linear_xpu", [&]() { + histogramdd_linear_template( + self.data_ptr(), + hist.data_ptr(), + weight.has_value() ? weight->data_ptr() : nullptr, + self.numel(), + bin_ct, + leftmost_edge, + rightmost_edge); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = out_bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 76790d817..a05d47e05 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -163,6 +163,11 @@ "test_compare_cpu_std_mean_xpu_bfloat16", "test_compare_cpu_sub_xpu_float16", "test_compare_cpu_var_mean_xpu_bfloat16", + + # test case doesn't make sense, will file an issue to track it. 
+    # https://github.com/pytorch/pytorch/issues/130916
+    "test_compare_cpu_histogram_xpu_float32",
+    "test_compare_cpu_histogram_xpu_float64",
 )

diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py
index 792c7c569..1d9d3df6d 100644
--- a/test/xpu/extended/test_ops_xpu.py
+++ b/test/xpu/extended/test_ops_xpu.py
@@ -81,6 +81,12 @@ def test_compare_cpu(self, device, dtype, op):
             self.proxy = Namespace.TestCommonProxy()
             test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
             test_common_test_fn(self.proxy, device, dtype, op)
+        # for operators CUDA doesn't support
+        elif (op.name in ["histogram",]):
+            if dtype in op.dtypes:
+                self.proxy = Namespace.TestCommonProxy()
+                test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
+                test_common_test_fn(self.proxy, device, dtype, op)
         else:
             pytest.skip(f"{op.name} has not supported {dtype} yet both for cpu and xpu")

diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py
index 3ffc3feb6..313a2e8b3 100644
--- a/test/xpu/run_test_with_skip.py
+++ b/test/xpu/run_test_with_skip.py
@@ -783,6 +783,10 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf'
     "test_dtypes_cosh_xpu",
 
+    # aten::histogram is implemented to align with MPS operator coverage; CUDA doesn't
+    # support it, but the test_dtypes infrastructure leverages CUDA-supported datatypes
+    "test_dtypes_histogram_xpu",
+
     # The following dtypes worked in forward but are not listed by the OpInfo: {torch.float16}.
     # Align with CPU implementation since,
     # 1. most cases of nextafter require Half dtype.

diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index 96e43058a..6d989fa14 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -209,6 +209,7 @@
     "aminmax",
     "argmin",
     "conj_physical",
+    "histogram",
     "repeat_interleave",
     "fmax",
     "fmin",

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 48ddc4c3f..224c0e7d8 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -660,6 +660,10 @@ supported:
   - ceil
   - ceil_
   - ceil.out
+  - histogram.bins_tensor
+  - histogram.bins_tensor_out
+  - histogram.bin_ct
+  - histogram.bin_ct_out
   - repeat_interleave.Tensor
   - norm.ScalarOpt_dim_dtype
   - norm.dtype_out
   - norm.ScalarOpt_dim

From bbd200abe2d3a3e290c45e04127723f65293ed7e Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Thu, 25 Jul 2024 18:27:53 +0800
Subject: [PATCH 8/8] Timm_regnet regression tracked in issue 603 (#650)

Skip the accuracy check; tracked in
https://github.com/intel/torch-xpu-ops/issues/603
---
 .github/ci_expected_accuracy/inductor_torchbench_training.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
index 2984ecbc9..dc766eac0 100644
--- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -93,7 +93,7 @@ tacotron2,fail_to_run,fail_to_run,fail_to_run,fail_to_run,fail_to_run
 timm_efficientdet,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
 timm_efficientnet,pass,pass,pass,pass,pass
 timm_nfnet,pass,pass,pass,pass,pass
-timm_regnet,pass,pass,pass,pass,pass
+timm_regnet,pass,fail_accuracy,pass,pass,pass
 timm_resnest,pass,pass,pass,pass,pass
 timm_vision_transformer,pass,pass,pass,pass,pass
timm_vision_transformer_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
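To close, a usage-level illustration of the histogram support added in patch 7,
covering both overloads registered in yaml/xpu_functions.yaml
(histogram.bin_ct and histogram.bins_tensor). This is a sketch, assuming an
XPU-enabled PyTorch build; the input values are arbitrary:

```python
import torch

x = torch.randn(1000, device="xpu")

# histogram.bin_ct: a fixed number of uniform bins over an explicit range
hist, edges = torch.histogram(x, bins=10, range=(-3.0, 3.0))

# histogram.bins_tensor: caller-supplied (possibly non-uniform) bin edges
bin_edges = torch.tensor([-3.0, -1.0, 0.0, 0.5, 3.0], device="xpu")
hist2, edges2 = torch.histogram(x, bins=bin_edges, density=True)

# With density=True the kernel divides by the total count and then by the bin
# widths, so the histogram integrates to 1 over the covered range.
assert torch.allclose((hist2 * edges2.diff()).sum(), torch.tensor(1.0, device="xpu"))
```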