From fbd095a57bb146c1360e2dfe5c8cd7ef615567ae Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Tue, 23 Jul 2024 20:04:00 -0700 Subject: [PATCH 01/17] Drop CPU fallback for aten::norm.out and aten::nextafter.out (#641) Fixes: #640 Fixes: 6eca394 ("Add aten::nextafter and its variants") CC: @fengyuan14 @yucai-intel @xytintel Signed-off-by: Dmitry Rogozhkin --- src/ATen/native/xpu/XPUFallback.template | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index d5ae19d5a..52b417415 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -239,8 +239,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "nanmedian", "nanmedian.dim_values", "nansum", - "norm.out", - "nextafter.out", "ormqr", "_pdist_backward", "_pdist_forward", From 07ebadbd582305ca4dac2759358e8be651979df4 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Wed, 24 Jul 2024 11:05:17 +0800 Subject: [PATCH 02/17] Update the comments of the radix sort kernel (#633) Update the comments of the radix sort kernel. Co-authored-by: Feng Yuan --- src/ATen/native/xpu/sycl/SortingKernels.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h index fb29a9f53..b39595143 100644 --- a/src/ATen/native/xpu/sycl/SortingKernels.h +++ b/src/ATen/native/xpu/sycl/SortingKernels.h @@ -539,9 +539,7 @@ void segmented_sort_pairs_( int num_elements) { constexpr int scaling_coef = sizeof(key_t) * sizeof(value_t) >= 64 ? 2 - : 1; // Attempt to reduce register pressure. The result will be incorrect - // when using too many local variables (registers). - // https://github.com/intel/torch-xpu-ops/issues/626 + : 1; // Attempt to reduce register pressure for performance. if (num_elements > 4096 / scaling_coef) { // Considering register pressure, we use a problem size of 4096 to delineate // the boundary between single tile sort and group sort. 
From f716a582852f562e2ad26a0381885b0509168fe9 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:54:54 +0800 Subject: [PATCH 03/17] Add aten::repeat_interleave (#564) ![image](https://github.com/intel/torch-xpu-ops/assets/105194461/1fe5b8bc-c82a-41eb-b673-7c5cb6f5d3cf) --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 1 - src/ATen/native/xpu/Repeat.cpp | 10 +++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 79 +++++++++++++++++++++++ src/ATen/native/xpu/sycl/RepeatKernel.h | 9 +++ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 7 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/Repeat.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.h diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 99069eefb..826b1e27b 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -47,7 +47,6 @@ static void cum_ops_meta( maybe_wrap_dim(dim, self.dim()); ScalarType out_dtype; - if (result.defined()) { out_dtype = dtype.value_or(result.scalar_type()); at::xpu::resize_out( diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp new file mode 100644 index 000000000..38e5ae8da --- /dev/null +++ b/src/ATen/native/xpu/Repeat.cpp @@ -0,0 +1,10 @@ +#include +#include +#include +namespace at { +Tensor XPUNativeFunctions::repeat_interleave( + const Tensor& repeats, + c10::optional output_size) { + return at::native::xpu::repeat_interleave_kernel(repeats, output_size); +} +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 52b417415..356a45f70 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -248,7 +248,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "repeat_interleave.Tensor", "replication_pad1d_backward.grad_input", "replication_pad1d.out", "replication_pad2d_backward", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp new file mode 100644 index 000000000..c3e27ab62 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +namespace at::native::xpu { +template +struct RepeatInterleaveKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + auto rep_ptr = rep_data_; + auto cum_ptr = cum_data_; + auto res_ptr = res_data_; + + for (int64_t i = item.get_global_id(0); i < size_; + i += item.get_global_range()[0]) { + int64_t end = cum_ptr[i]; + int64_t repeat = rep_ptr[i]; + int64_t start = end - repeat; + for (int64_t j = start; j < end; j++) { + res_ptr[j] = i; + } + } + } + RepeatInterleaveKernelFunctor( + const index_t* rep_data, + const int64_t* cum_data, + index_t* res_data, + int64_t size, + int64_t result_size) + : rep_data_(rep_data), + cum_data_(cum_data), + res_data_(res_data), + size_(size), + result_size_(result_size) {} + + private: + const index_t* rep_data_; + const int64_t* cum_data_; + index_t* res_data_; + int64_t size_; + int64_t result_size_; +}; + +template +static void compute_xpu( + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, + index_t* result_ptr, + int64_t size, + int64_t result_size) { + if (size == 0) + return; + + auto kfn = RepeatInterleaveKernelFunctor( + 
repeat_ptr, + cumsum_ptr, + result_ptr, + size, + result_size); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t local_range = size < wg_size ? size : wg_size; + int64_t global_range = ((size + local_range - 1) / local_range) * local_range; + + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); +} + +Tensor repeat_interleave_kernel( + const Tensor& repeat, + c10::optional output_size) { + Tensor output; + + AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_xpu", [&] { + output = repeat_interleave_common>( + repeat, output_size); + }); + return output; +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h new file mode 100644 index 000000000..3df41a956 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -0,0 +1,9 @@ +#pragma once +#include +namespace at::native::xpu { + +Tensor repeat_interleave_kernel( + const Tensor& repeats, + c10::optional output_size); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 3e3b5d4d1..96e43058a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "repeat_interleave", "fmax", "fmin", "floor", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index cd6f80b84..48ddc4c3f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,7 @@ supported: - ceil - ceil_ - ceil.out + - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out - norm.ScalarOpt_dim From d25d3d09a2a01867bf613f9cc535c596832eb2e8 Mon Sep 17 00:00:00 2001 From: yuchengliu1 Date: Wed, 24 Jul 2024 14:42:30 +0800 Subject: [PATCH 04/17] pow_Tensor_Scalar: fixing error result when scalar exp is a bool. (#638) Fixing TestBinaryUfuncsXPU.test_pow_xpu_int --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Pow.cpp | 8 +++++++- test/xpu/run_test_with_skip.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index df335bfa1..97dc5a0c2 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -58,7 +58,13 @@ Tensor& XPUNativeFunctions::pow_out( Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { Tensor out; auto iter = pow_tensor_scalar_meta(self, exponent, out); - native::xpu::pow_tensor_scalar_kernel(iter, exponent); + if (exponent.equal(0.0) || exponent.equal(false)) { + iter.output().fill_(1); + } else if (exponent.equal(1.0) || exponent.equal(true)) { + iter.output().copy_(self); + } else { + native::xpu::pow_tensor_scalar_kernel(iter, exponent); + } return iter.output(); } diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 0e00da513..3ffc3feb6 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -797,12 +797,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_list = ( "test_fmod_remainder_by_zero_integral_xpu_int64", # zero division is an undefined behavior: different handles on different backends "test_div_rounding_numpy_xpu_float16", # Calculation error. XPU implementation uses opmath type. - # RuntimeError: false INTERNAL ASSERT FAILED at "torch-xpu-ops/src/ATen/native/xpu/sycl/PowKernels.cpp":233, please report a bug to PyTorch. 
invalid combination of type in Pow function, common dtype: Short, exp is integral? 0 + # fail in complex_exponents=[-1.0 - 1.5j, 3.3j] + # Mismatched elements: 33 / 100 (33.0%) + # Greatest absolute difference: 0.00038337233127094805 at index (4,) (up to 1e-05 allowed) + # Greatest relative difference: 1.9085073290625587e-06 at index (6,) (up to 1.3e-06 allowed) "test_pow_xpu_int16", "test_pow_xpu_int32", "test_pow_xpu_int64", - "test_pow_xpu_int8", - "test_pow_xpu_uint8", # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available. "_jiterator_", # Unexpected success From e1e195bd4d6f9541fc05c54579242f07f30a63b3 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:55:52 +0800 Subject: [PATCH 05/17] Support XPU ABI=0 build (#531) # Motivation Support XPU ABI neutral build. Starting from compiler 2025.0, `libsycl.so` is ABI-neutral lib. Before this, the `libsycl-preview.so` is ABI-neutral. refer to https://github.com/pytorch/pytorch/pull/130110 --- cmake/BuildFlags.cmake | 6 +- cmake/Modules/FindSYCL.cmake | 5 +- cmake/Modules/FindSYCLToolkit.cmake | 73 ++++++------------------- cmake/SYCL.cmake | 18 ------ src/ATen/CMakeLists.txt | 2 +- src/ATen/native/xpu/sycl/AmpKernels.cpp | 4 +- src/CMakeLists.txt | 2 +- test/sycl/CMakeLists.txt | 2 +- 8 files changed, 28 insertions(+), 84 deletions(-) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index d0eb3e299..2f05d1ab0 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -59,10 +59,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func) set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value) set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz) + # Equivalent to build option -fpreview-breaking-changes for SYCL compiler. + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}) endif() - # TODO: Align with PyTorch and switch to ABI=0 eventually, after - # resolving incompatible implementation in SYCL runtime. 
- set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) set(SYCL_FLAGS ${SYCL_FLAGS} ${SYCL_KERNEL_OPTIONS}) set(TORCH_XPU_OPS_FLAGS ${SYCL_HOST_FLAGS}) diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake index b8fc8ba22..4e266a26f 100644 --- a/cmake/Modules/FindSYCL.cmake +++ b/cmake/Modules/FindSYCL.cmake @@ -407,7 +407,6 @@ macro(SYCL_LINK_DEVICE_OBJECTS output_file sycl_target) OUTPUT ${output_file} DEPENDS ${object_files} COMMAND ${SYCL_EXECUTABLE} - -fsycl ${SYCL_device_link_flags} -fsycl-link ${object_files} -Xs "\"${SYCL_OFFLINE_COMPILER_FLAGS}\"" @@ -471,7 +470,7 @@ macro(SYCL_ADD_LIBRARY sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES @@ -530,7 +529,7 @@ macro(SYCL_ADD_EXECUTABLE sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake index e478cb4c5..46e34c7f8 100644 --- a/cmake/Modules/FindSYCLToolkit.cmake +++ b/cmake/Modules/FindSYCLToolkit.cmake @@ -25,16 +25,20 @@ This will define the following variables: #]=======================================================================] -set(SYCLTOOLKIT_FOUND False) -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +include(${TORCH_ROOT}/cmake/Modules/FindSYCLToolkit.cmake) -set(SYCL_ROOT "") -if(DEFINED ENV{SYCL_ROOT}) - set(SYCL_ROOT $ENV{SYCL_ROOT}) -elseif(DEFINED ENV{CMPLR_ROOT}) - set(SYCL_ROOT $ENV{CMPLR_ROOT}) +if(NOT SYCL_FOUND) + set(SYCLTOOLKIT_FOUND FALSE) + return() endif() +if(SYCLTOOLKIT_FOUND) + return() +endif() +set(SYCLTOOLKIT_FOUND TRUE) + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) + if(WIN32) set(SYCL_EXECUTABLE_NAME icx) else() @@ -71,43 +75,6 @@ if(nocmplr) set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() -find_file( - SYCL_INCLUDE_DIR - NAMES include - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_file( - SYCL_INCLUDE_SYCL_DIR - NAMES sycl - HINTS ${SYCL_ROOT}/include - NO_DEFAULT_PATH - ) - -list(APPEND SYCL_INCLUDE_DIR ${SYCL_INCLUDE_SYCL_DIR}) - -find_file( - SYCL_LIBRARY_DIR - NAMES lib lib64 - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_library( - SYCL_LIBRARY - NAMES sycl - HINTS ${SYCL_LIBRARY_DIR} - NO_DEFAULT_PATH -) - -if((NOT SYCL_INCLUDE_DIR) OR (NOT SYCL_LIBRARY_DIR) OR (NOT SYCL_LIBRARY)) - set(SYCLTOOLKIT_FOUND False) - set(SYCL_REASON_FAILURE "SYCL sdk is incomplete!!") - set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") - return() -endif() - # Function to write a test case to verify SYCL features. function(SYCL_CMPLR_TEST_WRITE src) @@ -202,6 +169,13 @@ set(SYCL_FLAGS "") set(SYCL_LINK_FLAGS "") list(APPEND SYCL_FLAGS "-fsycl") list(APPEND SYCL_LINK_FLAGS "-fsycl") +if(LINUX) + string(REGEX MATCH "libsycl-preview.so" is_abi_neutral ${SYCL_LIBRARY}) + if(is_abi_neutral) + list(APPEND SYCL_FLAGS "-fpreview-breaking-changes") + list(APPEND SYCL_LINK_FLAGS "-fpreview-breaking-changes") + endif() +endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -249,14 +223,3 @@ endif() message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") - -# Avoid module variables conflict due to calling find_package recursively -# e.g. 
find_package -> add_subdirectory(entering in a sub-project) -> find_package -# find_package_handle_standard_args( -# SYCLToolkit -# FOUND_VAR SYCLTOOLKIT_FOUND -# REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_LIBRARY SYCL_FLAGS -# VERSION_VAR SYCL_LANGUAGE_VERSION -# REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") -set(SYCLTOOLKIT_FOUND True) - diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index b2bbadaa4..f0724ef55 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -25,24 +25,6 @@ if(NOT SYCL_VERSION) return() endif() -find_library(SYCL_LIBRARIES sycl HINTS ${SYCL_LIBRARY_DIR}) -# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix, -# where the current version of the SYCL runtime is 7. -# Until oneAPI adds support to sycl.lib without the version suffix, -# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts. -# TODO: remove this when sycl.lib is supported on Windows -if(WIN32) - set(sycl_runtime_version 7) - find_library( - SYCL_LIBRARIES - NAMES "sycl${sycl_runtime_version}" - HINTS ${SYCL_LIBRARY_DIR} - ) - if(SYCL_LIBRARIES STREQUAL "SYCL_LIBRARIES-NOTFOUND") - message(FATAL_ERROR "Cannot find a SYCL library on Windows") - endif() -endif() - set(SYCL_COMPILER_VERSION) file(READ ${SYCL_VERSION} version_contents) string(REGEX MATCHALL "__SYCL_COMPILER_VERSION +[0-9]+" VERSION_LINE "${version_contents}") diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index c95e860fa..815ad018f 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,6 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp", "native/xpu/*.cpp", "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 5e8d258a6..87354eea9 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -102,9 +102,9 @@ void amp_foreach_non_finite_check_and_unscale_kernel( } struct AmpUpdateScaleKernelFunctor { - void operator()(sycl::item<1> item) const { + void operator()(sycl::nd_item<1> item) const { // There is only single item/task scheduled. - if (item.get_linear_id() != 0) + if (item.get_global_linear_id() != 0) return; if (*found_inf_) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5e0a8549..fa17491a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARIES}) + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) diff --git a/test/sycl/CMakeLists.txt b/test/sycl/CMakeLists.txt index 0671fbc94..5bbee8643 100644 --- a/test/sycl/CMakeLists.txt +++ b/test/sycl/CMakeLists.txt @@ -42,7 +42,7 @@ add_dependencies(test_sycl_build_archive sycl_simple_kernel_test) # Instead, we use explicit linkage option '--whole-archive', which is required # by linkage of device object modules archived in the static library. Then # explicit linkage configuration of SYCL runtime library is required. 
-target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARIES}) +target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARY}) if(INSTALL_TEST) install(TARGETS test_sycl_build_archive DESTINATION bin) From fcaa34cb5c740c81fab2126b0a68e5f4c724324a Mon Sep 17 00:00:00 2001 From: majing Date: Thu, 25 Jul 2024 08:45:49 +0800 Subject: [PATCH 06/17] Add aten::histogram and variant (#569) CPU/MPS only ops --------- Signed-off-by: majing Signed-off-by: Ma, Jing1 Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Histogram.cpp | 195 +++++++++++++++ src/ATen/native/xpu/sycl/HistogramKernels.h | 23 ++ .../native/xpu/sycl/HistogramddKernels.cpp | 227 ++++++++++++++++++ test/xpu/extended/run_test_with_skip.py | 5 + test/xpu/extended/test_ops_xpu.py | 6 + test/xpu/run_test_with_skip.py | 4 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 4 + 8 files changed, 465 insertions(+) create mode 100644 src/ATen/native/xpu/Histogram.cpp create mode 100644 src/ATen/native/xpu/sycl/HistogramKernels.h create mode 100644 src/ATen/native/xpu/sycl/HistogramddKernels.cpp diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp new file mode 100644 index 000000000..de49952b2 --- /dev/null +++ b/src/ATen/native/xpu/Histogram.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +namespace at { + +/* Checks properties of input tensors input, bins, and weight. + */ +void histogramdd_check_inputs( + const Tensor& input, + const Tensor& bins, + const std::optional& weight) { + if (weight.has_value()) { + TORCH_CHECK( + weight->device() == input.device(), + "weight and input need to be on the same device.") + } + auto input_dtype = input.dtype(); + auto bins_dtype = bins.dtype(); + TORCH_CHECK( + input_dtype == bins_dtype, + "torch.histogramdd: input tensor and bins tensors should", + " have the same dtype, but got input with dtype ", + input_dtype, + " and bins with dtype ", + bins_dtype); + + const int64_t bins_dim = bins.dim(); + TORCH_CHECK( + bins_dim == 1, + "torch.histogramdd: bins tensor should have one dimension,", + " but got ", + bins_dim, + " dimensions in the bin tensor"); + + const int64_t numel = bins.numel(); + TORCH_CHECK( + numel > 0, + "torch.histogramdd: bins tensor should have at least 1 element,", + " but got ", + numel, + " elements in the bin tensor"); + + if (weight.has_value()) { + TORCH_CHECK( + input.dtype() == weight.value().dtype(), + "torch.histogramdd: if weight tensor is provided, ", + "input tensor and weight tensor should have the same dtype, ", + "but got input(", + input.dtype(), + ")", + ", and weight(", + weight.value().dtype(), + ")"); + + auto input_sizes = input.sizes().vec(); + + auto weight_sizes = weight.value().sizes().vec(); + if (weight_sizes.empty()) { + // correctly handle scalars + weight_sizes = {1}; + } + + TORCH_CHECK( + input_sizes == weight_sizes, + "torch.histogramdd: if weight tensor is provided it should have", + " the same shape as the input tensor excluding its innermost ", + "dimension, but got input with shape ", + input.sizes(), + " and weight ", + "with shape ", + weight.value().sizes()); + } +} + +/* Checks properties of output tensors hist and bin_edges, then resizes them. 
+ */ +void histogramdd_prepare_out( + const Tensor& input, + int64_t bin_ct, + const Tensor& hist, + const Tensor& bin_edges) { + TORCH_CHECK( + input.dtype() == hist.dtype(), + "torch.histogram: input tensor and hist tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and hist ", + hist.dtype()); + + TORCH_CHECK( + input.dtype() == bin_edges.dtype(), + "torch.histogram: input tensor and bin_edges tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and bin_edges ", + bin_edges.dtype()); + + TORCH_CHECK( + bin_ct > 0, "torch.histogram(): bins must be > 0, but got ", bin_ct); + + at::native::resize_output(bin_edges, {bin_ct + 1}); + + at::native::resize_output(hist, {bin_ct}); +} + +void histogramdd_prepare_out( + const Tensor& input, + const Tensor& bins, + const Tensor& hist, + const Tensor& bin_edges) { + int64_t bin_ct = bins.numel() - 1; + histogramdd_prepare_out(input, bin_ct, hist, bin_edges); +} + +static Tensor& histogramdd_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + histogramdd_check_inputs(self, bins, weight); + histogramdd_prepare_out(self, bins, hist, bin_edges); + + bin_edges.copy_(bins); + + at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); + return hist; +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_out( + reshaped_self, bins, reshaped_weight, density, hist, bin_edges); + + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); + return histogram_out(self, bins, weight, density, hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? 
weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_prepare_out(reshaped_self, bin_ct, hist, bin_edges); + histogramdd_check_inputs(reshaped_self, bin_edges, reshaped_weight); + + at::native::xpu::histogramdd_linear_kernel( + reshaped_self, bin_ct, range, reshaped_weight, density, hist, bin_edges); + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges_out = at::empty({0}, self.options()); + return histogram_out( + self, bin_ct, range, weight, density, hist, bin_edges_out); +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h new file mode 100644 index 000000000..da153186a --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges); + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp new file mode 100644 index 000000000..be888e4b4 --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -0,0 +1,227 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::native::xpu { + +template +struct HistogramddKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_ * bin_size_) { + int64_t ele_idx = wi_id / bin_size_; + int64_t bin_idx = wi_id % bin_size_; + + // [left, right) + if (input_[ele_idx] >= bin_edges_[bin_idx] && + input_[ele_idx] < bin_edges_[bin_idx + 1]) { + scalar_t value = weight_ ? weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_idx), value); + return; + } + + // For last bin, [left, right] + if (bin_idx == 0 && input_[ele_idx] == bin_edges_[bin_size_]) { + scalar_t value = weight_ ? 
weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_size_ - 1), value); + } + } + } + + HistogramddKernelFunctor( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) + : input_(input), + bin_edges_(bin_edges), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size) {} + + private: + const scalar_t* input_; + const scalar_t* bin_edges_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; +}; + +// For one dimension case +template +void histogramdd_template( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) { + HistogramddKernelFunctor kfn( + input, bin_edges, hist, weight, input_size, bin_size); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = + (input_size * bin_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +template +struct HistogramddLinearKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_) { + scalar_t i_value = input_[wi_id]; + if (i_value >= leftmost_edge_ && i_value <= rightmost_edge_) { + int64_t bin = + (int64_t)(((i_value - leftmost_edge_)) * bin_size_ / (rightmost_edge_ - leftmost_edge_)); + if (bin == bin_size_) + bin -= 1; + scalar_t value = weight_ ? weight_[wi_id] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin), value); + } + } + } + + HistogramddLinearKernelFunctor( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) + : input_(input), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size), + leftmost_edge_(leftmost_edge), + rightmost_edge_(rightmost_edge) {} + + private: + const scalar_t* input_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; + double leftmost_edge_; + double rightmost_edge_; +}; + +// For one dimension case +template +void histogramdd_linear_template( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) { + HistogramddLinearKernelFunctor kfn( + input, hist, weight, input_size, bin_size, leftmost_edge, rightmost_edge); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (input_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges_) { + globalContext().alertNotDeterministic("histogramdd_kernel_xpu"); + hist.fill_(0); + Tensor bin_edges = bin_edges_.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_xpu", [&]() { + histogramdd_template( + self.data_ptr(), + bin_edges.data_ptr(), + hist.data_ptr(), + weight.has_value() ? 
weight->data_ptr() : nullptr, + self.numel(), + bin_edges.numel() - 1); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges) { + globalContext().alertNotDeterministic("histogramdd_linear_kernel_xpu"); + hist.fill_(0); + + // default range for empty input + double leftmost_edge = 0., rightmost_edge = 1.; + if (range.has_value()) { + leftmost_edge = range.value()[0]; + rightmost_edge = range.value()[1]; + } else if (self.numel() > 0) { + auto extrema = at::aminmax(self); + leftmost_edge = std::get<0>(extrema).item(); + rightmost_edge = std::get<1>(extrema).item(); + } + + if (leftmost_edge == rightmost_edge) { + leftmost_edge -= 0.5; + rightmost_edge += 0.5; + } + + at::linspace_out(out_bin_edges, leftmost_edge, rightmost_edge, bin_ct + 1); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_linear_xpu", [&]() { + histogramdd_linear_template( + self.data_ptr(), + hist.data_ptr(), + weight.has_value() ? weight->data_ptr() : nullptr, + self.numel(), + bin_ct, + leftmost_edge, + rightmost_edge); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = out_bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 76790d817..a05d47e05 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -163,6 +163,11 @@ "test_compare_cpu_std_mean_xpu_bfloat16", "test_compare_cpu_sub_xpu_float16", "test_compare_cpu_var_mean_xpu_bfloat16", + + # test case doesn't make sense, will file an issue to track it. 
+ # https://github.com/pytorch/pytorch/issues/130916 + "test_compare_cpu_histogram_xpu_float32", + "test_compare_cpu_histogram_xpu_float64", ) diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py index 792c7c569..1d9d3df6d 100644 --- a/test/xpu/extended/test_ops_xpu.py +++ b/test/xpu/extended/test_ops_xpu.py @@ -81,6 +81,12 @@ def test_compare_cpu(self, device, dtype, op): self.proxy = Namespace.TestCommonProxy() test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu) test_common_test_fn(self.proxy, device, dtype, op) + # for CUDA doesn't support operators + elif (op.name in ["histogram",]): + if dtype in op.dtypes: + self.proxy = Namespace.TestCommonProxy() + test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu) + test_common_test_fn(self.proxy, device, dtype, op) else: pytest.skip(f"{op.name} has not supported {dtype} yet both for cpu and xpu") diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 3ffc3feb6..313a2e8b3 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -783,6 +783,10 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf' "test_dtypes_cosh_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support + # but test_dtypes infrastructure leverage CUDA supported datatypes + "test_dtypes_histogram_xpu", + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.float16}. # Align with CPU implementation since, # 1. most cases of nextafter require Half dtype. diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 96e43058a..6d989fa14 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "histogram", "repeat_interleave", "fmax", "fmin", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 48ddc4c3f..224c0e7d8 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,10 @@ supported: - ceil - ceil_ - ceil.out + - histogram.bins_tensor + - histogram.bins_tensor_out + - histogram.bin_ct + - histogram.bin_ct_out - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out From bbd200abe2d3a3e290c45e04127723f65293ed7e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 25 Jul 2024 18:27:53 +0800 Subject: [PATCH 07/17] Timm_regnet regression which tracked in issue 603 (#650) Skip check and tracked in https://github.com/intel/torch-xpu-ops/issues/603 --- .github/ci_expected_accuracy/inductor_torchbench_training.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv index 2984ecbc9..dc766eac0 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv @@ -93,7 +93,7 @@ tacotron2,fail_to_run,fail_to_run,fail_to_run,fail_to_run,fail_to_run timm_efficientdet,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load timm_efficientnet,pass,pass,pass,pass,pass timm_nfnet,pass,pass,pass,pass,pass -timm_regnet,pass,pass,pass,pass,pass +timm_regnet,pass,fail_accuracy,pass,pass,pass timm_resnest,pass,pass,pass,pass,pass timm_vision_transformer,pass,pass,pass,pass,pass 
timm_vision_transformer_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip From c2405f25a2a886b91f5a84432721511db977a5dd Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Sat, 27 Jul 2024 16:44:35 +0800 Subject: [PATCH 08/17] Add aten::nansum and aten::nansum.out (#567) Add aten::nansum and aten::nansum.out. --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 34 ++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/ReduceOpsKernels.h | 2 + .../native/xpu/sycl/ReduceSumProdKernels.cpp | 82 ++++++++++++++++--- test/xpu/extended/run_test_with_skip.py | 2 + test/xpu/xpu_test_utils.py | 2 + yaml/xpu_functions.yaml | 2 + 7 files changed, 111 insertions(+), 14 deletions(-) diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 826b1e27b..33fb03823 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -869,6 +869,40 @@ Tensor XPUNativeFunctions::amin( return out; } +Tensor& XPUNativeFunctions::nansum_out( + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + // For integral types, use existing sum as + // integral types don't have `Nan`. + if (c10::isIntegralType(self.scalar_type(), true)) { + return at::sum_out(result, self, dim, keepdim, opt_dtype); + } + + auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); + result = resize_reduction(result, self, dim, keepdim, out_dtype); + auto iter = meta::make_reduction_from_out_ty( + self, result, dim, keepdim, result.scalar_type()); + + if (iter.numel() == 0) { + result = result.zero_(); + } else { + native::xpu::nansum_kernel(iter); + } + return result; +} + +Tensor XPUNativeFunctions::nansum( + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + std::optional opt_dtype) { + Tensor result; + return XPUNativeFunctions::nansum_out(self, dim, keepdim, opt_dtype, result); +} + static ScalarType get_result_or_self_value_dtype( const Tensor& self, const Tensor& result, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 356a45f70..b8969548c 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -238,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "multi_margin_loss_backward", "nanmedian", "nanmedian.dim_values", - "nansum", "ormqr", "_pdist_backward", "_pdist_forward", diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index 9ee0ee955..4a54766ae 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -16,6 +16,8 @@ void mean_kernel(TensorIterator& iter); void sum_kernel(TensorIterator& iter); +void nansum_kernel(TensorIterator& iter); + void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); void aminmax_kernel(TensorIterator& iter); diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 728a75582..d68ca376b 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,6 +9,36 @@ namespace at { namespace native { namespace xpu { +// The function `reduce_dispatch` below dispatches to the kernel based +// on the type of `iter`. 
It takes care of the common logic +// for handling Half-Precision floating types. +// Otherwise the functor `op` is called to dispatch to the kernel +// of relevant type. +// +// Note: Functor `op` should take care of all the types to be supported +// except for `at::Half` and `at::BFloat16`. +template < + template < + typename scalar_t, + typename acc_t = scalar_t, + typename out_t = scalar_t> + typename OpFunctor, + typename GeneralDispatcher> +static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { + if (iter.dtype() == kHalf) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } else if (iter.dtype() == kBFloat16) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } + op(iter); +} + template struct SumFunctor { inline acc_t operator()(acc_t a, acc_t b) const { @@ -36,22 +67,47 @@ struct sum_functor { }; void sum_kernel(TensorIterator& iter) { - if (iter.dtype() == kHalf) { - return sum_functor{}(iter); - } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { - // type promotion that does cast and reduction in a single kernel - return sum_functor{}(iter); - } else if (iter.dtype() == kBFloat16) { - return sum_functor{}(iter); - } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { - // type promotion that does cast and reduction in a single kernel - return sum_functor{}(iter); + auto general_dispatcher = [](TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kBool, kComplexHalf, iter.dtype(), "sum_xpu", [&]() { + sum_functor{}(iter); + }); + }; + reduce_dispatch(iter, general_dispatcher); +} + +template < + typename scalar_t, + typename acc_t = scalar_t, + typename out_t = scalar_t> +struct NansumFunctor { + void operator()(TensorIterator& iter) { + gpu_reduce_kernel(iter, NanSumOps{}); } +}; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - kBool, kComplexHalf, iter.dtype(), "sum_xpu", [&]() { - sum_functor{}(iter); +template +struct NansumComplexFunctor { + void operator()(TensorIterator& iter) { + using acc_t = at::opmath_type; + gpu_reduce_kernel(iter, NanSumOps{}); + } +}; + +void nansum_kernel(TensorIterator& iter) { + auto general_dispatcher = [](TensorIterator& iter) { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "nansum_xpu", [&]() { + NansumComplexFunctor{}(iter); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_xpu", [&]() { + NansumFunctor{}(iter); }); + } + }; + reduce_dispatch(iter, general_dispatcher); } } // namespace xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index a05d47e05..37f4ab220 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -127,6 +127,8 @@ "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", "test_compare_cpu__batch_norm_with_update_xpu_float16", "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", + "test_compare_cpu_nansum_xpu_bfloat16", + "test_compare_cpu_nanmean_xpu_bfloat16", # Align with CUDA impl by using accumulate type. But CPU doesn't use. # When XPU uses original data type, the case passes. 
"test_compare_cpu_logit_xpu_bfloat16", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 6d989fa14..e8c0dc55b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -136,6 +136,7 @@ "atanh", "sqrt", "sum", + "nansum", "amin", "amax", "std", @@ -218,6 +219,7 @@ "copysign", "count_nonzero", "nan_to_num", + "nanmean", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 224c0e7d8..92ab06e74 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -339,6 +339,8 @@ supported: - min.dim_min - sum.dim_IntList - sum.IntList_out + - nansum + - nansum.out - mean.out - mean.dim - std.correction From c1f99eaac5fda36d230d58a92489713706c0d2f1 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Sat, 27 Jul 2024 16:46:03 +0800 Subject: [PATCH 09/17] Add aten::masked_select and its variants (#649) Task list: - [x] masked_select - [x] masked_select.out Fixes: #629 --- src/ATen/native/xpu/Indexing.cpp | 54 ++++++++++++++++++++++- src/ATen/native/xpu/TensorTopK.cpp | 5 ++- src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 6 +-- test/xpu/run_test_with_skip.py | 1 - test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 2 + 7 files changed, 59 insertions(+), 11 deletions(-) diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index 42321e7b7..5db9a7238 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -1,10 +1,11 @@ #include +#include #include +#include #include #include -#include - #include +#include #include namespace at { @@ -44,4 +45,53 @@ Tensor XPUNativeFunctions::index_select( return index_select_out(self, dim, index, out); } +static Tensor& masked_select_out_impl( + Tensor& result, + const Tensor& self, + const Tensor& mask) { + NoNamesGuard guard; + + TORCH_CHECK( + mask.scalar_type() == ScalarType::Bool, + "masked_select: expected BoolTensor for mask"); + TORCH_CHECK( + self.scalar_type() == result.scalar_type(), + "masked_select(): self and result must have the same scalar type"); + + auto mask_temp = (mask.dim() == 0) + ? c10::MaybeOwned::owned(mask.unsqueeze(0)) + : c10::MaybeOwned::borrowed(mask); + auto self_temp = (self.dim() == 0) + ? c10::MaybeOwned::owned(self.unsqueeze(0)) + : c10::MaybeOwned::borrowed(self); + + // Cannot reassign to mask_temp and self_temp here! if they are + // owning and expand_outplace returns a borrow, the returned borrow + // would dangle. + auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); + XPUNativeFunctions::index_out( + *std::get<1>(mask_self_expanded), + c10::List>( + {*std::move(std::get<0>(mask_self_expanded))}), + result); + + return result; +} + +Tensor XPUNativeFunctions::masked_select( + const Tensor& self, + const Tensor& mask) { + namedinference::compute_broadcast_outnames(self, mask); + Tensor result = at::empty({0}, self.options()); + return masked_select_out_impl(result, self, mask); +} + +Tensor& XPUNativeFunctions::masked_select_out( + const Tensor& self, + const Tensor& mask, + Tensor& result) { + namedinference::compute_broadcast_outnames(self, mask); + return masked_select_out_impl(result, self, mask); +} + } // namespace at diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp index a9c5836a7..3961160bf 100644 --- a/src/ATen/native/xpu/TensorTopK.cpp +++ b/src/ATen/native/xpu/TensorTopK.cpp @@ -55,8 +55,9 @@ void topk_out_impl( TORCH_CHECK( k >= 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), "selected index k out of range"); - - // If k is 0 the result is an empty tensor, so we don't need to launch a kernel. + + // If k is 0 the result is an empty tensor, so we don't need to launch a + // kernel. if (k == 0) { return; } diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index b8969548c..7bcc67ca5 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -225,7 +225,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "logspace.out", "lu_unpack.out", "masked_scatter_", - "masked_select", "max_pool3d_with_indices", "max_pool3d_with_indices_backward", "max_unpool2d", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index c3e27ab62..0b10f982a 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -51,11 +51,7 @@ static void compute_xpu( return; auto kfn = RepeatInterleaveKernelFunctor( - repeat_ptr, - cumsum_ptr, - result_ptr, - size, - result_size); + repeat_ptr, cumsum_ptr, result_ptr, size, result_size); int64_t wg_size = syclMaxWorkGroupSize(kfn); int64_t local_range = size < wg_size ? size : wg_size; diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 313a2e8b3..a1995c5af 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -53,7 +53,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_compare_cpu_to_sparse_xpu_float32", "test_errors_dot_xpu", "test_errors_kthvalue_xpu", - "test_errors_masked_select_xpu", "test_errors_sparse_mul_layout0_xpu", "test_errors_sparse_mul_layout1_xpu", "test_errors_sparse_mul_layout2_xpu", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index e8c0dc55b..9b3ee286b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -80,6 +80,7 @@ "index_fill", "index_put", "index_select", + "masked_select", "isin", "isnan", "le", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 92ab06e74..75a40f54f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -112,6 +112,8 @@ supported: - index_fill_.int_Tensor - index_select - index_select.out + - masked_select + - masked_select.out - gcd - gcd.out - gcd_ From eb223b2e29bb3e35fef34fda7775c31f7cc5101f Mon Sep 17 00:00:00 2001 From: hjhee Date: Sat, 27 Jul 2024 16:46:57 +0800 Subject: [PATCH 10/17] erfinv: Sharing erfinv scalar functor to align precision with other PyTorch backends. 
(#634) Use existing `calc_erfinv` implementation from `` reduces the discrepancy between CPU MKL and XPU implementations of the erfinv for specific test cases in float64 --- .../xpu/sycl/UnarySpecialOpsKernels.cpp | 65 ++----------------- test/xpu/run_test_with_skip.py | 20 ------ 2 files changed, 7 insertions(+), 78 deletions(-) diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index b7d0b8974..e2303634e 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -79,67 +79,16 @@ void erfc_kernel(TensorIteratorBase& iter) { template struct ErfinvFunctor { - using opmath_type = at::opmath_type; - scalar_t operator()(scalar_t in) const { - scalar_t out; - opmath_type z, num, dem; - - auto x = static_cast(in); - if (std::fabs(x) > 1.0f) { - out = static_cast(NAN); - return out; - } - if (std::fabs(x) == 1.0f) { - out = static_cast( - (std::copysign(1.0, static_cast(x))) * - (std::numeric_limits::infinity())); - return out; - } - if (std::fabs(x) <= 0.7f) { - z = x * x; - num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]); - dem = - ((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z + - static_cast(1.0)); - out = x * num / dem; - } else { - z = static_cast( - std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0))); - num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0]; - dem = (d_[1] * z + d_[0]) * z + static_cast(1.0); - out = static_cast( - static_cast(std::copysign(1.0, static_cast(x))) * - num / dem); - } - out = out - - static_cast( - (std::erf(static_cast(out)) - x) / - ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); - out = out - - static_cast( - (std::erf(static_cast(out)) - x) / - ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); - return out; + return calc_erfinv(in); } +}; - static constexpr double PI_f64_ = 3.14159265358979323846; - static constexpr std::array a_ = { - 0.886226899, - -1.645349621, - 0.914624893, - -0.140543331}; - static constexpr std::array b_ = { - -2.118377725, - 1.442710462, - -0.329097515, - 0.012229801}; - static constexpr std::array c_ = { - -1.970840454, - -1.624906493, - 3.429567803, - 1.641345311}; - static constexpr std::array d_ = {3.543889200, 1.637067800}; +template <> +struct ErfinvFunctor { + c10::Half operator()(c10::Half in) const { + return calc_erfinv(float(in)); + } }; void erfinv_kernel(TensorIteratorBase& iter) { diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index a1995c5af..719af3ca4 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1551,26 +1551,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): # Relative difference: 6.156719153309558e-06 (up to 1e-06 allowed) "test_log1p_complex_xpu_complex64", - # CPU MKL::erfinv vs XPU impl. 
At most 6.e-06 - # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) - "test_reference_numerics_large__refs_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) - "test_reference_numerics_large_erfinv_xpu_float64", - # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) - # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) - "test_reference_numerics_normal__refs_erfinv_xpu_float64", - # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) - # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) - "test_reference_numerics_normal_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) - "test_reference_numerics_small__refs_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) - "test_reference_numerics_small_erfinv_xpu_float64", - # Issue: https://github.com/intel/torch-xpu-ops/issues/622 # Mismatched elements: 8 / 943593 (0.0%) # Greatest absolute difference: inf at index (9, 860) (up to 0.001 allowed) From 0f6f4d394861c0f3d28e6b8a1e3ef158dc057d2c Mon Sep 17 00:00:00 2001 From: majing Date: Mon, 29 Jul 2024 10:40:47 +0800 Subject: [PATCH 11/17] Add aten::replication_pad1d/2d/3d and backward (#600) Signed-off-by: Ma, Jing1 Signed-off-by: majing Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReplicationPadding.cpp | 316 ++++++++ src/ATen/native/xpu/XPUFallback.template | 6 - .../xpu/sycl/ReplicationPaddingKernels.cpp | 743 ++++++++++++++++++ .../xpu/sycl/ReplicationPaddingKernels.h | 40 + test/xpu/extended/run_test_with_skip.py | 6 + test/xpu/xpu_test_utils.py | 2 +- yaml/xpu_functions.yaml | 12 + 7 files changed, 1118 insertions(+), 7 deletions(-) create mode 100644 src/ATen/native/xpu/ReplicationPadding.cpp create mode 100644 src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp new file mode 100644 index 000000000..b4f6d3272 --- /dev/null +++ b/src/ATen/native/xpu/ReplicationPadding.cpp @@ -0,0 +1,316 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +void replication_pad1d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + + int64_t dimw = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + + at::native::padding::check_valid_input<1>(input, paddingSize); + + if (input.ndimension() == 3) { + nbatch = input.size(0); + dimw++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t iwidth = input.size(dimw); + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth >= 1, + "input (W: ", iwidth, ") is too small." 
+ " Calculated output W: ", owidth); + + if (output.defined()) { + if (input.ndimension() == 2) { + xpu::resize_out(output, {nslices, owidth}, {}, input.options()); + } else { + xpu::resize_out(output, {nbatch, nslices, owidth}, {}, input.options()); + } + } else { + if (input.ndimension() == 2) { + output = xpu::create_out({nslices, owidth}, {}, input.options()); + } else { + output = xpu::create_out({nbatch, nslices, owidth}, {}, input.options()); + } + } +} + +void replication_pad1d_backward_meta( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef paddingSize) { + int64_t dimw = 1; + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + + if (input.ndimension() == 3) { + dimw++; + } + + /* sizes */ + int64_t iwidth = input.size(dimw); + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", owidth, + " Got: ", grad_output.size(dimw)); + + if (grad_input.defined()) { + xpu::resize_out(grad_input, input.sizes(), {}, input.options()); + } else { + grad_input = xpu::create_out(input.sizes(), {}, input.options()); + } +} + +void replication_pad2d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + int64_t pad_t = paddingSize[2]; + int64_t pad_b = paddingSize[3]; + int64_t dimw = 2; + int64_t dimh = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + at::native::padding::check_valid_input<2>(input, paddingSize); + + if (input.dim() == 4) { + nbatch = input.size(0); + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t oheight = iheight + pad_t + pad_b; + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth >= 1 || oheight >= 1, + "input (H: ", iheight, ", W: ", iwidth, " ) is too small." 
+ " Calculated output H: ", oheight, " W: ", owidth); + + if (output.defined()) { + if (input.dim() == 3) { + xpu::resize_out( + output, + {nslices, oheight, owidth}, {}, input.options()); + } else { + xpu::resize_out( + output, {nbatch, nslices, oheight, owidth}, {}, input.options()); + } + } else { + if (input.dim() == 3) { + output = xpu::create_out( + {nslices, oheight, owidth}, {}, input.options()); + } else { + output = xpu::create_out( + {nbatch, nslices, oheight, owidth}, {}, input.options()); + } + } +} + +void replication_pad3d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + int64_t pleft = paddingSize[0]; + int64_t pright = paddingSize[1]; + int64_t ptop = paddingSize[2]; + int64_t pbottom = paddingSize[3]; + int64_t pfront = paddingSize[4]; + int64_t pback = paddingSize[5]; + int64_t dimw = 3; + int64_t dimh = 2; + int64_t dimd = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + at::native::padding::check_valid_input<3>(input, paddingSize); + + if (input.dim() == 5) { + nbatch = input.size(0); + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t idepth = input.size(dimd); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t odepth = idepth + pfront + pback; + int64_t oheight = iheight + ptop + pbottom; + int64_t owidth = iwidth + pleft + pright; + + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, + ") is too small." + " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); + + if (output.defined()) { + if (input.dim() == 4) { + xpu::resize_out( + output, + {nslices, odepth, oheight, owidth}, {}, input.options()); + } else { + xpu::resize_out( + output, {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); + } + } else { + if (input.dim() == 4) { + output = xpu::create_out( + {nslices, odepth, oheight, owidth}, {}, input.options()); + } else { + output = xpu::create_out( + {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); + } + } +} + +Tensor XPUNativeFunctions::replication_pad1d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad1d_meta(output, input, padding); + native::xpu::replication_pad1d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad1d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad1d_meta(output, input, padding); + native::xpu::replication_pad1d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad1d_backward( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + Tensor grad_input; + replication_pad1d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::replication_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad1d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + replication_pad1d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::replication_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad2d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad2d_meta(output, 
input, padding); + native::xpu::replication_pad2d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad2d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad2d_meta(output, input, padding); + native::xpu::replication_pad2d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad2d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + native::xpu::replication_pad2d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor XPUNativeFunctions::replication_pad2d_backward( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + native::xpu::replication_pad2d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor XPUNativeFunctions::replication_pad3d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad3d_meta(output, input, padding); + native::xpu::replication_pad3d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad3d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad3d_meta(output, input, padding); + native::xpu::replication_pad3d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad3d_backward( + const Tensor& grad_output, + const Tensor& input, + at::IntArrayRef padding) { + auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + native::xpu::replication_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad3d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + native::xpu::replication_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 7bcc67ca5..3f2064653 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -246,10 +246,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "replication_pad1d_backward.grad_input", - "replication_pad1d.out", - "replication_pad2d_backward", - "replication_pad2d.out", "round.decimals_out", "round.out", "rrelu_with_noise", @@ -318,8 +314,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "vdot", "xlogy.OutTensor", "_upsample_bicubic2d_aa.out", - "replication_pad3d.out", - "replication_pad3d_backward", }; for (auto& op_name : fallback_list) { m.impl( diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp new file mode 100644 index 000000000..bb42aa327 --- /dev/null +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -0,0 +1,743 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +inline int imin(int a, int b) { + return a > b ? b : a; +} +inline int imax(int a, int b) { + return a > b ? 
a : b; +} + +template +struct ParallelReplicationPad1dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_id = item.get_global_id(2); + if (output_id < output_plane_size_) { + int64_t output_x = output_id % output_.size(2); + int64_t i_start_x = imax(0, -pad_left_); + int64_t o_start_x = imax(0, pad_left_); + int64_t input_x = + imin(imax(pad_left_, output_x), input_.size(2) + pad_left_ - 1) - + o_start_x + i_start_x; + + f_(input_, output_, item.get_group(1), item.get_group(0), output_x, input_x); + } + } + ParallelReplicationPad1dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right, + const F f, + int64_t output_plane_size) + : input_(input), + output_(output), + pad_left_(pad_left), + pad_right_(pad_right), + f_(f), + output_plane_size_(output_plane_size) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t pad_left_; + int64_t pad_right_; + const F f_; + int64_t output_plane_size_; +}; + +template +void parallel_replication_pad1d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2); + + ParallelReplicationPad1dKernelFunctor kfn( + input, output, pad_left, pad_right, f, output_plane_size); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct ReplicationPad1dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t plane, + int64_t batch, + int64_t output_x, + int64_t intput_x) const { + auto value_to_copy = input[batch][plane][intput_x]; + output[batch][plane][output_x] = value_to_copy; + } +}; + +template +void replication_pad1d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right) { + ReplicationPad1dForwardFunctor f; + parallel_replication_pad1d(input, output, pad_left, pad_right, f); +} + +template +struct ReplicationPad1dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t plane, + int64_t batch, + int64_t output_x, + int64_t intput_x) const { + auto value_to_add = grad_output[batch][plane][output_x]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][intput_x]; + atomicAdd(target, value_to_add); + } +}; + +template +void replication_pad1d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t pad_left, + int64_t pad_right) { + ReplicationPad1dBackwardFunctor f; + parallel_replication_pad1d(grad_input, grad_output, pad_left, pad_right, f); +} + +template +struct ParallelReplicationPad2dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + const int output_id = item.get_global_id(2); + const int batch = item.get_global_id(0); + const int plane = item.get_global_id(1); + + if (output_id < output_.size(2) * output_.size(3)) { + const int output_x = output_id / output_.size(3); // height + const int output_y = output_id % output_.size(3); // width + + const int iStartX = imax(0, -padT_); + const int iStartY = imax(0, -padL_); + 
const int oStartX = imax(0, padT_); + const int oStartY = imax(0, padL_); + + const int input_x = imin(imax(padT_, output_x), input_.size(2) + padT_ - 1) - oStartX + iStartX; + const int input_y = imin(imax(padL_, output_y), input_.size(3) + padL_ - 1) - oStartY + iStartY; + + f_(input_, output_, batch, plane, input_x, input_y, output_x, output_y); + } +} + ParallelReplicationPad2dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t padT, + int64_t padL, + const F f) + : input_(input), + output_(output), + padT_(padT), + padL_(padL), + f_(f) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t padT_; + int64_t padL_; + const F f_; +}; + +template +void parallel_replication_pad2d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + const int padT, + const int padL, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2) * output.size(3); + + ParallelReplicationPad2dKernelFunctor kfn( + input, output, padT, padL, f); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct ReplicationPad2dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t batch, + int64_t plane, + int64_t input_x, + int64_t input_y, + int64_t output_x, + int64_t output_y) const { + scalar_t valueToCopy = input[batch][plane][input_x][input_y]; + output[batch][plane][output_x][output_y] = valueToCopy; + } +}; + +template +void replication_pad2d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t padT, + int64_t padL) { + ReplicationPad2dForwardFunctor f; + parallel_replication_pad2d(input, output, padT, padL, f); +} + +template +struct ReplicationPad2dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t batch, + int64_t plane, + int64_t input_x, + int64_t input_y, + int64_t output_x, + int64_t output_y) const { + scalar_t valueToAdd = grad_output[batch][plane][output_x][output_y]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][input_x][input_y]; + atomicAdd(target, valueToAdd); + } +}; + +template +void replication_pad2d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + const int padT, + const int padL) { + ReplicationPad2dBackwardFunctor f; + parallel_replication_pad2d(grad_input, grad_output, padT, padL, f); +} + +template +struct ParallelReplicationPad3dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_id = item.get_global_id(2); + if (output_id < output_plane_size_) { + int64_t output_x = output_id % output_.size(4); + int64_t output_y = (output_id / output_.size(4)) % output_.size(3); + int64_t output_z = output_id / (output_.size(3) * output_.size(4)); + + int64_t i_start_x = imax(0, -pad_left_); + int64_t i_start_y = imax(0, -pad_top_); + int64_t i_start_z = imax(0, -pad_front_); + int64_t o_start_x = imax(0, pad_left_); + int64_t o_start_y = imax(0, pad_top_); + int64_t o_start_z = imax(0, pad_front_); + + int64_t input_x = + imin(imax(pad_left_, output_x), input_.size(4) + pad_left_ - 1) - + o_start_x + i_start_x; + 
int64_t input_y = + imin(imax(pad_top_, output_y), input_.size(3) + pad_top_ - 1) - o_start_y + + i_start_y; + int64_t input_z = + imin(imax(pad_front_, output_z), input_.size(2) + pad_front_ - 1) - + o_start_z + i_start_z; + + f_(input_, + output_, + item.get_group(1), + item.get_group(0), + output_z, + output_y, + output_x, + input_z, + input_y, + input_x); + } + } + ParallelReplicationPad3dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F f, + int64_t output_plane_size) + : input_(input), + output_(output), + pad_left_(pad_left), + pad_top_(pad_top), + pad_front_(pad_front), + f_(f), + output_plane_size_(output_plane_size) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t pad_left_; + int64_t pad_top_; + int64_t pad_front_; + const F f_; + int64_t output_plane_size_; +}; + +template +void parallel_replication_pad3d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2) * output.size(3) * output.size(4); + + ParallelReplicationPad3dKernelFunctor kfn( + input, output, pad_left, pad_top, pad_front, f, output_plane_size); + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), queue, + kfn); +} + +template +struct ReplicationPad3dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t intput_z, + int64_t intput_y, + int64_t intput_x) const { + auto value_to_copy = input[batch][plane][intput_z][intput_y][intput_x]; + output[batch][plane][output_z][output_y][output_x] = value_to_copy; + } +}; + +template +void replication_pad3d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + ReplicationPad3dForwardFunctor f; + parallel_replication_pad3d(input, output, pad_left, pad_top, pad_front, f); +} + +template +struct ReplicationPad3dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t intput_z, + int64_t intput_y, + int64_t intput_x) const { + auto value_to_add = grad_output[batch][plane][output_z][output_y][output_x]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][intput_z][intput_y][intput_x]; + atomicAdd(target, value_to_add); + } +}; + +template +void replication_pad3d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + ReplicationPad3dBackwardFunctor f; + parallel_replication_pad3d( + grad_input, grad_output, pad_left, pad_top, pad_front, f); +} + +void replication_pad1d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + + if 
(input.numel() == 0) { + return; + } + + int64_t pad_left = padding[0]; + int64_t pad_right = padding[1]; + int64_t num_input_dims = input.dim(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "replication_pad1d_xpu", [&] { + auto input_ = input; + auto output_ = output; + if (num_input_dims == 2) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + + auto input_packed = input_.packed_accessor64(); + auto output_packed = output_.packed_accessor64(); + + replication_pad1d_forward_template( + input_packed, output_packed, pad_left, pad_right); + }); +} + +void replication_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad1d_backward_xpu"); + + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + TORCH_CHECK(grad_output.numel() < std::numeric_limits::max(), + "replication_pad1d only supports output tensors with less than 2^63 - 1 elements"); + + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + + int pad_left = padding[0]; + int pad_right = padding[1]; + int num_input_dims = input.ndimension(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + input.scalar_type(), + "replication_pad1d_backward_xpu", + [&] { + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (num_input_dims == 2) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + + replication_pad1d_backward_template( + grad_input_packed, grad_output_packed, pad_left, pad_right); + }); +} + +void replication_pad2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + if (input.numel() == 0) { + return; + } + const auto padL = padding[0]; + const auto padT = padding[2]; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, + input.scalar_type(), "replication_pad2d_xpu", [&] { + Tensor input_ = input; + Tensor output_ = output; + if (input.dim() == 3) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + auto devInput = input_.packed_accessor64(); + auto devOutput = output_.packed_accessor64(); + replication_pad2d_forward_template(devInput, devOutput, padT, padL); + } + ); +} + +void replication_pad2d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad2d_backward_xpu"); + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + TORCH_CHECK(canUse32BitIndexMath(grad_output), + "output gradient tensor must fit into 32-bit index math"); + TORCH_CHECK(padding.size() == 4, "padding Size is expected to be 4"); + + const auto padL = padding[0]; + const auto padR = padding[1]; + const auto padT = padding[2]; + const auto padB = padding[3]; + int dimh = 1; + int dimw = 2; + + int numInputDims = input.dim(); + if (numInputDims == 4) { + dimh++; + dimw++; + } + const auto 
iheight = input.size(dimh); + const auto iwidth = input.size(dimw); + const auto oheight = iheight + padT + padB; + const auto owidth = iwidth + padL + padR; + + TORCH_CHECK(owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", owidth, ", Got: ", + grad_output.size(dimw)); + TORCH_CHECK(oheight == grad_output.size(dimh), + "grad_output height unexpected. Expected: ", oheight, ", Got: ", + grad_output.size(dimh)); + + grad_input.resize_as_(input); + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, + input.scalar_type(), "replication_pad2d_backward_xpu", [&] { + + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (numInputDims == 3) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + + replication_pad2d_backward_template(grad_input_packed, grad_output_packed, padT, padL); + } + ); +} + +void replication_pad3d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + if (input.numel() == 0) { + return; + } + int64_t pad_left = padding[0]; + int64_t pad_top = padding[2]; + int64_t pad_front = padding[4]; + + int64_t num_input_dims = input.dim(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "replication_pad3d_xpu", [&] { + auto input_ = input; + auto output_ = output; + if (num_input_dims == 4) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + + auto input_packed = input_.packed_accessor64(); + auto output_packed = output_.packed_accessor64(); + + replication_pad3d_forward_template( + input_packed, output_packed, pad_left, pad_top, pad_front); + }); +} + +static inline void shapeAndGradOutputCheck3d( + const Tensor& input, + const Tensor& grad_output, + int64_t pad_left, + int64_t pad_right, + int64_t pad_top, + int64_t pad_bottom, + int64_t pad_front, + int64_t pad_back) { + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + int64_t num_input_dims = input.dim(); + + bool valid_dims = + input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (num_input_dims == 4 && valid_dims) || + (num_input_dims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + int plane_dim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + if (num_input_dims == 5) { + plane_dim++; + dimd++; + dimh++; + dimw++; + } + + int64_t num_planes = input.size(plane_dim); + int64_t idepth = input.size(dimd); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t odepth = idepth + pad_front + pad_back; + int64_t oheight = iheight + pad_top + pad_bottom; + int64_t owidth = iwidth + pad_left + pad_right; + TORCH_CHECK( + owidth >= 1 || oheight >= 1 || odepth >= 1, + "input (D: ", + idepth, + " H: ", + iheight, + ", W: ", + iwidth, + ") is too small." + " Calculated output D: ", + odepth, + " H: ", + oheight, + " W: ", + owidth); + + TORCH_CHECK(canUse32BitIndexMath(grad_output), + "output gradient tensor must fit into 32-bit index math"); + + TORCH_CHECK( + num_planes == grad_output.size(plane_dim), + "grad_output width unexpected. 
Expected: ", + num_planes, + ", Got: ", + grad_output.size(plane_dim)); + TORCH_CHECK( + owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", + owidth, + ", Got: ", + grad_output.size(dimw)); + TORCH_CHECK( + oheight == grad_output.size(dimh), + "grad_output height unexpected. Expected: ", + oheight, + ", Got: ", + grad_output.size(dimh)); + TORCH_CHECK( + odepth == grad_output.size(dimd), + "grad_output depth unexpected. Expected: ", + odepth, + ", Got: ", + grad_output.size(dimd)); +} + +void replication_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad3d_backward_xpu"); + TORCH_CHECK(padding.size() == 6, "padding Size is expected to be 6"); + + int pad_left = padding[0]; + int pad_right = padding[1]; + int pad_top = padding[2]; + int pad_bottom = padding[3]; + int pad_front = padding[4]; + int pad_back = padding[5]; + shapeAndGradOutputCheck3d( + input, + grad_output, + pad_left, + pad_right, + pad_top, + pad_bottom, + pad_front, + pad_back); + + grad_input.resize_as_(input); + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + int num_input_dims = input.dim(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + input.scalar_type(), + "replication_pad3d_backward_xpu", + [&] { + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (num_input_dims == 4) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + replication_pad3d_backward_template( + grad_input_packed, + grad_output_packed, + pad_left, + pad_top, + pad_front); + }); +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h new file mode 100644 index 000000000..85d19d5bf --- /dev/null +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void replication_pad1d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad2d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad3d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 37f4ab220..6f8fe8d3a 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -170,6 +170,12 @@ # https://github.com/pytorch/pytorch/issues/130916 "test_compare_cpu_histogram_xpu_float32", "test_compare_cpu_histogram_xpu_float64", + + # Precision error. 
+ # Mismatched elements: 2 / 125 (1.6%) + # Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed) + # Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed) + "test_compare_cpu_cumprod_xpu_bfloat16", ) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 9b3ee286b..2a36ddfb0 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -59,7 +59,7 @@ "clamp_min", "clone", "copy", - "cumprod" + "cumprod", "cumsum", "equal", "eq", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 75a40f54f..77d77d4f0 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -564,6 +564,18 @@ supported: - reflection_pad3d.out - reflection_pad3d_backward - reflection_pad3d_backward.grad_input + - replication_pad1d + - replication_pad1d.out + - replication_pad1d_backward + - replication_pad1d_backward.grad_input + - replication_pad2d + - replication_pad2d.out + - replication_pad2d_backward + - replication_pad2d_backward.grad_input + - replication_pad3d + - replication_pad3d.out + - replication_pad3d_backward + - replication_pad3d_backward.grad_input - native_group_norm - native_group_norm_backward - elu From 0608225e2b9953a7dc8c20d57cb3b100fc3140bd Mon Sep 17 00:00:00 2001 From: Stonepia Date: Mon, 29 Jul 2024 11:20:03 +0800 Subject: [PATCH 12/17] Use safe API to access Tensor data pointer (#655) The un-templated data_ptr() does not check about the type check and storage_initialized() check. Thus, it may introduce potential bugs. By using the template data_ptr, this could throw an error like below: ``` torch._dynamo.exc.TorchRuntimeError: Failed running call_function torchvision.roi_align(*(FakeTensor(..., device='xpu:0', size=(1, 1024, 50, 75), dtype=torch.bfloat16), FakeTensor(..., device='xpu:0', size=(1000, 5), dtype=torch.bfloat16), 0.0625, 14, 14, 0, True), **{}): The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory. 
``` --- src/ATen/native/xpu/NMS.cpp | 5 ++-- src/ATen/native/xpu/sycl/MultiTensorApply.h | 25 +++++++++++-------- src/ATen/native/xpu/sycl/NMSKernel.cpp | 4 +-- src/ATen/native/xpu/sycl/Shape.cpp | 4 +-- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 10 ++++---- .../native/xpu/sycl/TriangularOpsKernels.cpp | 4 +-- 6 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/ATen/native/xpu/NMS.cpp b/src/ATen/native/xpu/NMS.cpp index 37ed19d50..ea1ac4e9c 100644 --- a/src/ATen/native/xpu/NMS.cpp +++ b/src/ATen/native/xpu/NMS.cpp @@ -48,14 +48,15 @@ Tensor nms(const Tensor& dets, const Tensor& scores, double iou_threshold_) { auto mask = nms_kernel(dets_sorted, iou_threshold); at::Tensor mask_cpu = mask.to(at::kCPU); - unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr(); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.mutable_data_ptr(); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); - int64_t* keep_out = (int64_t*)keep.data_ptr(); + int64_t* keep_out = keep.mutable_data_ptr(); int num_to_keep = 0; for (int i = 0; i < dets_num; i++) { diff --git a/src/ATen/native/xpu/sycl/MultiTensorApply.h b/src/ATen/native/xpu/sycl/MultiTensorApply.h index 0817e40be..bb1abe277 100644 --- a/src/ATen/native/xpu/sycl/MultiTensorApply.h +++ b/src/ATen/native/xpu/sycl/MultiTensorApply.h @@ -164,7 +164,7 @@ void multi_tensor_apply( tensor_lists[0][0].options().dtype(at::kByte)); auto metaAddressInput = static_cast*>( - addressStorage.data_ptr()); + addressStorage.mutable_data_ptr()); TLMetaForAddressScalar* tlAddress = nullptr; auto tlAddress_dptr = at::xpu::HostAlloc( @@ -180,7 +180,7 @@ void multi_tensor_apply( tlAddress[t].scalar_vals = scalars[t].to(); totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -196,7 +196,8 @@ void multi_tensor_apply( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); @@ -248,7 +249,7 @@ void multi_tensor_apply( {(int)(sizeof(TLMetaForAddress) * n_tensors)}, tensor_lists[0][0].options().dtype(at::kByte)); auto metaAddressInput = - static_cast*>(addressStorage.data_ptr()); + static_cast*>(addressStorage.mutable_data_ptr()); TLMetaForAddress* tlAddress = nullptr; auto tlAddress_dptr = @@ -262,7 +263,7 @@ void multi_tensor_apply( tlAddress[t].numel_to_tensor = numel; totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -278,7 +279,8 @@ void multi_tensor_apply( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); @@ -324,8 +326,8 @@ void multi_tensor_apply_for_fused_optimizer( auto 
addressStorage = at::empty( {(int)(sizeof(TLFusedMetaForAddress) * n_tensors)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaFusedAddressInput = - static_cast*>(addressStorage.data_ptr()); + auto metaFusedAddressInput = static_cast*>( + addressStorage.mutable_data_ptr()); TLFusedMetaForAddress* tlAddress = nullptr; auto tlAddress_dptr = @@ -337,10 +339,10 @@ void multi_tensor_apply_for_fused_optimizer( for (size_t t = 0; t < n_tensors; ++t) { auto numel = tensor_lists[0][t].numel(); tlAddress[t].numel_to_tensor = numel; - tlAddress[t].state_steps_addresses = state_steps[t].data_ptr(); + tlAddress[t].state_steps_addresses = state_steps[t].mutable_data_ptr(); totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -356,7 +358,8 @@ void multi_tensor_apply_for_fused_optimizer( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index 451555810..3bf712582 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -111,8 +111,8 @@ Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold) { (size_t)col_blocks, (size_t)col_blocks * nms_items_per_group}; sycl::range<2> local_range{1, (size_t)nms_items_per_group}; using acc_t = acc_type_device; - auto dets_sorted_ptr = (scalar_t*)dets_sorted.data_ptr(); - auto mask_ptr = (unsigned long long*)mask.data_ptr(); + auto dets_sorted_ptr = dets_sorted.data_ptr(); + auto mask_ptr = (unsigned long long*)mask.data_ptr(); auto caller = NMSKernelFunctor( dets_num, iou_threshold, dets_sorted_ptr, mask_ptr); sycl_kernel_submit( diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 345d5078a..1c9c1b9d5 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -186,7 +186,7 @@ void parallel_cat( int nDims) { // First, let's set up our kernel parameters. We start with a raw pointer to // the storage for the output Tensor. 
- scalar_out_t* data = static_cast(out.data_ptr()); + scalar_out_t* data = static_cast(out.mutable_data_ptr()); // Kernel Parameter int64_t tensorMetadataSize = @@ -195,7 +195,7 @@ void parallel_cat( auto d_inputs_storage = at::empty({tensorMetadataSize}, out.options().dtype(at::kByte)); auto d_inputs = static_cast*>( - d_inputs_storage.data_ptr()); + d_inputs_storage.mutable_data_ptr()); OutputTensorSizeStride param; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 57ebb9846..8db72165a 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1367,9 +1367,9 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int input_start = - ((uint64_t)input.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)input.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = @@ -1558,11 +1558,11 @@ void spatial_softmax_backward( using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int gradin_start = - ((uint64_t)gradInput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradInput.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / sizeof(scalar_t); int gradoutput_start = - ((uint64_t)gradOutput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradOutput.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = canUse32BitIndexMath(gradInput) && diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index 07fd0be0a..65c165c2c 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -83,8 +83,8 @@ void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { IndexType result_stride_0 = (IndexType)result.stride(-2); IndexType result_stride_1 = (IndexType)result.stride(-1); - scalar_t* result_ptr = (scalar_t*)(result.data_ptr()); - scalar_t* self_ptr = (scalar_t*)(self.data_ptr()); + scalar_t* result_ptr = result.data_ptr(); + scalar_t* self_ptr = self.data_ptr(); ApplyTriuTrilKernelFunctor kfn( k, From e210c5cee1922c820643967d94f39cc30325ff23 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:14:51 +0800 Subject: [PATCH 13/17] Enable aten::smooth_l1_loss forward/backward (#621) --- src/ATen/native/xpu/Loss.cpp | 58 +++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 2 - .../native/xpu/sycl/BinaryMiscOpsKernels.cpp | 26 +++++++++ .../native/xpu/sycl/BinaryMiscOpsKernels.h | 2 + .../native/xpu/sycl/PointwiseOpsKernels.cpp | 34 +++++++++++ .../native/xpu/sycl/PointwiseOpsKernels.h | 2 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 3 + 8 files changed, 126 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp index f09f68b8a..050ff07b9 100644 --- a/src/ATen/native/xpu/Loss.cpp +++ 
b/src/ATen/native/xpu/Loss.cpp @@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out( return grad_input; } + +Tensor& XPUNativeFunctions::smooth_l1_loss_out( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& result) { + if (reduction != Reduction::None) { + TORCH_INTERNAL_ASSERT( + reduction == Reduction::Mean || reduction == Reduction::Sum); + result.resize_({}); + Tensor loss; + auto iter = TensorIterator::borrowing_binary_op(loss, input, target); + native::xpu::smooth_l1_kernel(iter, beta); + if (reduction == Reduction::Mean) { + at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); + } else { + at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); + } + } else { + auto iter = TensorIterator::borrowing_binary_op(result, input, target); + native::xpu::smooth_l1_kernel(iter, beta); + } + return result; +} + +Tensor XPUNativeFunctions::smooth_l1_loss( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta) { + Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + result = XPUNativeFunctions::smooth_l1_loss_out( + input, target, reduction, beta, result); + return result; +} + +Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) { + auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; + auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) + .promote_inputs_to_common_dtype(true) + .cast_common_dtype_to_outputs(true) + .enforce_safe_casting_to_output(true) + .build(); + native::xpu::smooth_l1_backward_kernel(iter, norm, beta); + return grad_input; +} + Tensor XPUNativeFunctions::binary_cross_entropy( const Tensor& self, const Tensor& target, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3f2064653..93321f23d 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -257,8 +257,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "signbit.out", "sign.out", "sinc.out", - "smooth_l1_loss_backward.grad_input", - "smooth_l1_loss.out", "special_airy_ai.out", "special_bessel_j0.out", "special_bessel_j1.out", diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 00c5398af..5ac71c163 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -23,6 +23,32 @@ void mse_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, MSEFunctor()); }); } +template +struct SmoothL1Functor { + scalar_t operator()(scalar_t input, scalar_t target) const { + auto z = std::abs(input - target); + return z < beta_val ? 
scalar_t(0.5) * z * z / beta_val + : z - scalar_t(0.5) * beta_val; + } + SmoothL1Functor(scalar_t beta_val) : beta_val(beta_val) {} + + private: + scalar_t beta_val; +}; + +void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "smooth_l1_xpu", + [&iter, beta]() { + scalar_t beta_val(beta); + SmoothL1Functor f(beta_val); + gpu_kernel(iter, f); + }); +} + template struct HuberFunctor { scalar_t operator()(scalar_t a, scalar_t b) const { diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 94cfb7c90..17672ec29 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void mse_kernel(TensorIteratorBase& iter); +void smooth_l1_kernel(TensorIteratorBase& iter, double beta); + void huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 5dc06e25a..822a83e99 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -125,6 +125,40 @@ void mse_backward_kernel(TensorIterator& iter, const Scalar& value) { }); } +template +struct SmoothL1BackwardFunctor { + scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output) + const { + const auto x = input - target; + if (x < -beta_val) + return -norm_val * grad_output; + else if (x > beta_val) + return norm_val * grad_output; + else + return norm_val * x * grad_output / beta_val; + } + SmoothL1BackwardFunctor(scalar_t norm_val, scalar_t beta_val) + : norm_val(norm_val), beta_val(beta_val) {} + + private: + scalar_t norm_val; + scalar_t beta_val; +}; + +void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta) { + AT_DISPATCH_ALL_TYPES_AND2( + kHalf, + kBFloat16, + iter.dtype(), + "smooth_l1_backward_xpu", + [&iter, &norm, beta] { + auto norm_val = norm.to(); + scalar_t beta_val(beta); + SmoothL1BackwardFunctor f(norm_val, beta_val); + gpu_kernel(iter, f); + }); +} + template struct HuberBackwardFunctor { scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output) diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 586a64f3c..613c3cca6 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -10,6 +10,8 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value); void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta); + void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 2a36ddfb0..c281747f2 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -179,6 +179,7 @@ "nn.functional.upsample_bilinear", "nn.functional.upsample_nearest", "nn.functional.nll_loss", + "nn.functional.smooth_l1_loss", "nn.functional.mse_loss", "nn.functional.binary_cross_entropy", "nn.functional.huber_loss", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 77d77d4f0..fd087c7bc 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -308,6 +308,9 @@ supported: - bitwise_and.Tensor_out - bitwise_or.Tensor_out 
- bitwise_xor.Tensor_out + - smooth_l1_loss + - smooth_l1_loss.out + - smooth_l1_loss_backward.grad_input - bitwise_not.out - where.self_out - where.self From 36dfe230dea6a737fe260b072276cbcca3ca3f9a Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:40:20 +0800 Subject: [PATCH 14/17] Add aten::polar and its variants (#606) Co-authored-by: yucai Co-authored-by: Feng Yuan --- src/ATen/native/xpu/TensorFactories.cpp | 15 +++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/ComplexKernels.cpp | 14 ++++++++++++++ src/ATen/native/xpu/sycl/ComplexKernels.h | 2 ++ test/xpu/extended/run_test_with_skip.py | 4 ++++ test/xpu/run_test_with_skip.py | 12 ++++++++++-- test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 8 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index 110590958..44da487f7 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -151,6 +151,21 @@ Tensor& XPUNativeFunctions::complex_out( return result; } +Tensor& XPUNativeFunctions::polar_out( + const Tensor& abs, + const Tensor& angle, + Tensor& result) { + complex_check_dtype(result, abs, angle); + auto iter = TensorIteratorConfig() + .add_output(result) + .add_const_input(abs) + .add_const_input(angle) + .check_all_same_dtype(false) + .build(); + native::xpu::polar_kernel(iter); + return result; +} + Tensor& XPUNativeFunctions::randperm_out( int64_t n, c10::optional generator, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 93321f23d..4a4c96828 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -240,7 +240,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "ormqr", "_pdist_backward", "_pdist_forward", - "polar.out", "_prelu_kernel", "_prelu_kernel_backward", "prod", diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 56b25d0ef..87504bd5e 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -21,4 +21,18 @@ void complex_kernel(TensorIterator& iter) { }); } +template +struct PolarFunctor { + c10::complex operator()(scalar_t a, scalar_t b) const { + return c10::complex(a * std::cos(b), a * std::sin(b)); + } +}; + +void polar_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "polar_xpu", [&]() { + PolarFunctor f; + gpu_kernel(iter, f); + }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index 990bcd14e..d51556b4f 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -6,4 +6,6 @@ namespace at::native::xpu { void complex_kernel(TensorIterator& iter); +void polar_kernel(TensorIterator& iter); + } // namespace at::native::xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 6f8fe8d3a..a75d2e675 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -154,6 +154,10 @@ # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) "test_compare_cpu_hypot_xpu_bfloat16", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. 
+ # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_compare_cpu_polar_xpu_bfloat16", + # Regressions due to PyTorch uplift (Numeric difference in float and bfloat) # https://github.com/intel/torch-xpu-ops/issues/549 # Example fail log diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 719af3ca4..7d051607e 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -782,6 +782,10 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf' "test_dtypes_cosh_xpu", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. + # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -3016,8 +3020,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): res += launch_test("nn/test_load_state_dict_xpu.py") # test_module_hooks - -res += launch_test("nn/test_module_hooks_xpu.py") +skip_list = ( + # TypeError: TestStateDictHooks.test_register_state_dict_post_hook() missing 1 required positional argument: 'private' + # https://github.com/intel/torch-xpu-ops/issues/658 + "test_register_state_dict_post_hook", +) +res += launch_test("nn/test_module_hooks_xpu.py", skip_list) # test_parametrization diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index c281747f2..823988488 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -208,6 +208,7 @@ "unique", "multinomial", "lerp", + "polar", "frac", "aminmax", "argmin", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index fd087c7bc..9d453d215 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -268,6 +268,7 @@ supported: - eye.m_out - _efficientzerotensor - complex.out + - polar.out - clone - fill_.Scalar - fill_.Tensor From 82268376d9f215f5ca0988264608ead45ae028f9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 30 Jul 2024 13:19:00 +0800 Subject: [PATCH 15/17] Enable weekly test (#637) 1. enable weekly test contains 3 suites full e2e and full ut 2. 
always() to not cancelled() --- .github/workflows/nightly_ondemand.yml | 59 +++++++++++++++++++++----- .github/workflows/pull.yml | 4 +- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 1a663661e..e5be18fb5 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -2,8 +2,10 @@ name: Nightly-OnDemand Tests on: schedule: - # GMT+8 21:00 every day - - cron: '0 13 * * *' + # GMT+8 21:00 every workday + - cron: '0 13 * * 0-4' + # GMT+8 0:00 Saturday + - cron: '0 16 * * 5' workflow_dispatch: inputs: pytorch: @@ -78,7 +80,7 @@ jobs: runs-on: pvc_e2e # Don't run on forked repos if: github.repository_owner == 'intel' - timeout-minutes: 900 + timeout-minutes: 3600 env: pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} @@ -174,8 +176,10 @@ jobs: echo "$GITHUB_ENV" rm -rf ../pytorch/inductor_log rm -rf /tmp/torchinductor_* + + # Nihglty launch - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: huggingface @@ -185,7 +189,7 @@ jobs: scenario: accuracy hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Torchbench BF16 Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: torchbench @@ -195,7 +199,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Timm_models FP16 Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: timm_models @@ -204,6 +208,38 @@ jobs: scenario: accuracy env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + # Weekly launch + - name: Weekly Huggingface Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: huggingface + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Weekly Torchbench Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: torchbench + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Weekly Timm_models Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: timm_models + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: github.event_name != 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test @@ -216,7 +252,7 @@ jobs: hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - name: Summarize archieve files id: summary - if: always() + if: ${{ ! cancelled() }} run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files @@ -237,14 +273,14 @@ jobs: exit 1 fi - name: Upload Inductor XPU E2E Data - if: always() + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files - + Tests-Failure-And-Report: - if: always() + if: ${{ ! cancelled() }} runs-on: pvc_e2e permissions: issues: write @@ -288,6 +324,9 @@ jobs: test_type="On-demand" test_issue_id=426 cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" + elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then + test_type="Weekly" + test_issue_id=432 else test_type="Nightly" test_issue_id=432 diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5ee55fdcf..350c88f91 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,7 +127,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Summarize archieve files - if: always() + if: ${{ ! cancelled() }} run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files @@ -137,7 +137,7 @@ jobs: exit 1 fi - name: Upload Inductor XPU E2E Data - if: always() + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} From 67116b3c359c619f0fdb9b0266f4fbd6607cc956 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Tue, 30 Jul 2024 13:55:12 +0800 Subject: [PATCH 16/17] Add aten::masked_scatter_ (#652) Add aten::masked_scatter_. 
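For reference, a minimal usage sketch of the operator this patch enables (illustrative only, not part of the change; it assumes a build where `torch.xpu.is_available()` reports a device and falls back to CPU otherwise):

```python
# Illustrative sketch of aten::masked_scatter_ semantics; not part of this patch.
import torch

device = "xpu" if torch.xpu.is_available() else "cpu"

dst = torch.zeros(2, 3, device=device)
mask = torch.tensor([[True, False, True],
                     [False, True, False]], device=device)
src = torch.tensor([1.0, 2.0, 3.0], device=device)  # must hold >= mask.sum() elements

dst.masked_scatter_(mask, src)
# dst == [[1., 0., 2.],
#         [0., 3., 0.]]
# True positions consume src elements in row-major order, which is the
# ordering the kernel derives from the exclusive prefix sum over the mask.
```

The exclusive scan over the flattened mask gives each True element its offset into `source`, and the size-check functor asserts asynchronously that `source` provides at least that many elements.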
--------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Indexing.cpp | 30 +++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/Indexing.cpp | 90 +++++++++++++++++++ src/ATen/native/xpu/sycl/IndexingKernels.h | 6 ++ src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h | 13 ++- test/xpu/test_torch_xpu.py | 4 +- test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 8 files changed, 136 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index 5db9a7238..d4d5598e6 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -45,6 +45,36 @@ Tensor XPUNativeFunctions::index_select( return index_select_out(self, dim, index, out); } +Tensor& XPUNativeFunctions::masked_scatter_( + Tensor& self, + const Tensor& mask, + const Tensor& source) { + at::assert_no_internal_overlap(self); + TORCH_CHECK( + self.scalar_type() == source.scalar_type(), + "masked_scatter_: expected self and source to have same dtypes but got ", + self.scalar_type(), + " and ", + source.scalar_type()); + TORCH_CHECK( + mask.dtype() == ScalarType::Bool, + "masked_scatter_ only supports boolean masks, " + "but got mask with dtype ", + mask.dtype()); + + c10::MaybeOwned b_mask = + expand_inplace(self, mask, "masked_scatter_"); + + if (self.numel() == 0) { + return self; + } + + auto maskPrefixSum = at::empty(self.sizes(), mask.options().dtype(kLong)); + native::xpu::masked_scatter_kernel(self, *b_mask, maskPrefixSum, source); + + return self; +} + static Tensor& masked_select_out_impl( Tensor& result, const Tensor& self, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 4a4c96828..23b79d6dd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -224,7 +224,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "log_normal_", "logspace.out", "lu_unpack.out", - "masked_scatter_", "max_pool3d_with_indices", "max_pool3d_with_indices_backward", "max_unpool2d", diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index 312f4ce12..fcee372d1 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -597,6 +597,7 @@ void index_put_deterministic_kernel( if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { auto expanded_size = at::DimVector(expandedValue.sizes()); + auto size1 = expandedValue.sizes(); auto size2 = linearIndex.sizes(); if (are_expandable(size1, size2)) { @@ -667,6 +668,95 @@ void index_put_deterministic_kernel( } } +template +struct MaskedScatterElementwiseFunctor { + scalar_t operator()( + const scalar_t a, + const bool mask, + const int64_t maskPrefixSum) const { + if (mask) { + return source_ptr_[maskPrefixSum]; + } + return a; + } + MaskedScatterElementwiseFunctor(const scalar_t* source_ptr) + : source_ptr_(source_ptr) {} + + private: + const scalar_t* source_ptr_; +}; + +struct MaskedScatterSizeCheckFunctor { + void operator()(sycl::nd_item<1> item) const { + const auto totalElements = *mask_exclusive_sum_ + *mask_; + SYCL_KERNEL_ASSERT(totalElements <= srcSize_); + } + MaskedScatterSizeCheckFunctor( + const int64_t* const mask_exclusive_sum, + const bool* const mask, + const int64_t srcSize) + : mask_exclusive_sum_(mask_exclusive_sum), + mask_(mask), + srcSize_(srcSize) {} + + private: + const int64_t* const mask_exclusive_sum_; + const bool* const mask_; + const int64_t srcSize_; +}; + +void masked_scatter_kernel( + const 
TensorBase& self, + const TensorBase& mask, + const TensorBase& maskPrefixSum, + const TensorBase& source) { + const auto srcSize = source.numel(); + const auto mask_cont = mask.contiguous(); + const auto mask_numel = mask.numel(); + + // Use a prefix sum to determine the output locations of the masked elements + auto maskPrefixSum_data = maskPrefixSum.mutable_data_ptr(); + auto mask_data = mask_cont.const_data_ptr(); + + pstl::exclusive_scan( + mask_data, + mask_data + mask_numel, + maskPrefixSum_data, + static_cast(0)); + + // Asynchronously check that the number of `1` elements present in the mask + // must be <= the number of elements available in `src`. + auto caller = MaskedScatterSizeCheckFunctor( + &maskPrefixSum_data[mask_numel - 1], &mask_data[mask_numel - 1], srcSize); + sycl_kernel_submit((size_t)1, (size_t)1, getCurrentSYCLQueue(), caller); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + auto source_contig = source.contiguous(); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_input(self) + .add_const_input(mask_cont) + .add_input(maskPrefixSum) + .build(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + ScalarType::Bool, + ScalarType::BFloat16, + ScalarType::Half, + self.scalar_type(), + "masked_scatter_", + [&]() { + auto source_ptr = source_contig.const_data_ptr(); + gpu_kernel(iter, MaskedScatterElementwiseFunctor(source_ptr)); + }); +} + } // namespace at::native::xpu + #pragma GCC diagnostic pop #pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index cde537e73..8f32f49f9 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -47,4 +47,10 @@ void index_put_deterministic_kernel( bool accumulate, bool unsafe); +void masked_scatter_kernel( + const TensorBase& self, + const TensorBase& mask, + const TensorBase& maskPrefixSum, + const TensorBase& source); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h index 66c316e25..efc9f164e 100644 --- a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h +++ b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h @@ -1,11 +1,10 @@ #pragma once #include -#include - #include #include #include +#include #include #include #include @@ -23,10 +22,10 @@ struct KSScanKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { // initialize local_input auto cur_init = init_; if (scan_type == 1) { - local_scan_[local_id] = first_[local_id]; + local_scan_[local_id] = c10::load(&first_[local_id]); } else { if (local_id > 0) - local_scan_[local_id] = first_[local_id - 1]; + local_scan_[local_id] = c10::load(&first_[local_id - 1]); else local_scan_[local_id] = 0; } @@ -72,17 +71,17 @@ struct KSScanWithCarrierKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { auto cur_init = (group_id == 0 ? 
init_ : 0); if (global_id < N_) { if (scan_type == 1) { - local_scan_[local_id] = first_[global_id]; + local_scan_[local_id] = c10::load(&first_[global_id]); } else { if (local_id > 0) - local_scan_[local_id] = first_[global_id - 1]; + local_scan_[local_id] = c10::load(&first_[global_id - 1]); else local_scan_[local_id] = 0; } if (local_id == 0) local_scan_[local_id] += cur_init; if (local_id == wgroup_size_ - 1) { - carry_ptr_[group_id] = first_[global_id]; + carry_ptr_[group_id] = c10::load(&first_[global_id]); } } item_id.barrier(sycl_local_fence); diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index b82a8ec67..80fb3c8b0 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -3995,11 +3995,11 @@ def test_masked_scatter(self, device, dtype): dest_ones.masked_scatter_(mask, src_ones) self.assertEqual(dest_ones, dest_ones_expected, atol=0, rtol=0) - # Bound checking in CUDA is done inside a kernel + # Bound checking in GPU is done inside a kernel # in order to avoid synchronization, but this means # we can not clear the failures. So there is no way # to test it then recover. - if self.device_type != 'cuda' or self.device_type != 'xpu': + if self.device_type != 'cuda' and self.device_type != 'xpu': # make src smaller. this should fail src = torch.zeros(num_copy - 1, dtype=dt, device=device) with self.assertRaises(RuntimeError): diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 823988488..635b1f1b6 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -80,6 +80,7 @@ "index_fill", "index_put", "index_select", + "masked_scatter", "masked_select", "isin", "isnan", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 9d453d215..5c354e0b0 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -105,6 +105,7 @@ supported: - isnan.out - masked_fill_.Tensor - masked_fill_.Scalar + - masked_scatter_ - index_add.out - index_add_ - index_add From 8a821bf6aaa72f733ba65f6b2cbef40983bd71a9 Mon Sep 17 00:00:00 2001 From: "Huaiyu, Zheng" Date: Tue, 30 Jul 2024 14:37:25 +0800 Subject: [PATCH 17/17] Add aten::_weight_norm_interface and aten::_weight_norm_interface_backward (#607) add aten::_weight_norm_interface and aten::_weight_norm_interface_backward --- src/ATen/native/xpu/WeightNorm.cpp | 27 + .../xpu/sycl/AdaptiveMaxPooling2dKernels.cpp | 8 +- src/ATen/native/xpu/sycl/BatchKernel.h | 237 ++-- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 13 +- src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 5 +- src/ATen/native/xpu/sycl/Indexing.h | 14 +- .../native/xpu/sycl/WeightNormKernels.cpp | 1027 +++++++++++++++++ src/ATen/native/xpu/sycl/WeightNormKernels.h | 16 + yaml/xpu_functions.yaml | 2 + 9 files changed, 1234 insertions(+), 115 deletions(-) create mode 100644 src/ATen/native/xpu/WeightNorm.cpp create mode 100644 src/ATen/native/xpu/sycl/WeightNormKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/WeightNormKernels.h diff --git a/src/ATen/native/xpu/WeightNorm.cpp b/src/ATen/native/xpu/WeightNorm.cpp new file mode 100644 index 000000000..7fec9ecfe --- /dev/null +++ b/src/ATen/native/xpu/WeightNorm.cpp @@ -0,0 +1,27 @@ +#include +#include +namespace at { +std::tuple XPUNativeFunctions::_weight_norm_interface( + const Tensor& v, + const Tensor& g, + int64_t dim) { + return native::xpu::weight_norm_kernel(v, g, dim); +} + +std::tuple XPUNativeFunctions::_weight_norm_interface_backward( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + 
int64_t dim) { + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK( + dim == 0 || dim == saved_v.dim() - 1, + "fused kernels can only be applied for first or last dim") + + return native::xpu::weight_norm_backward_kernel( + grad_w, saved_v, saved_g, saved_norms, dim); +} +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index fe50d1a6c..9da231b0d 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -124,8 +124,8 @@ void launch_adaptive_max_pool2d_kernel( using KernelClass = AdaptiveMaxPool2dKernelFunctor; int64_t output_size = batch * plane * osizeH * osizeW; - BatchKernelConfig cfg = { - 1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.build(); @@ -301,8 +301,8 @@ void launch_adaptive_max_pool2d_backward_kernel( int64_t sizeP) { using KernelClass = AdaptiveMaxPool2dBackwardKernelFunctor; - BatchKernelConfig cfg = { - 1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.build(); diff --git a/src/ATen/native/xpu/sycl/BatchKernel.h b/src/ATen/native/xpu/sycl/BatchKernel.h index cff967a76..bc0da3bb5 100644 --- a/src/ATen/native/xpu/sycl/BatchKernel.h +++ b/src/ATen/native/xpu/sycl/BatchKernel.h @@ -25,6 +25,146 @@ class BatchKernelConfig { } public: + template + static BatchKernelConfig make_config( + int64_t batch, + int64_t problem, + int64_t stride, + int64_t problem_batch, + bool problem_along_x, + Policy policy = Policy::pSegment, + int64_t prefer_wg_size = 0) { + BatchKernelConfig cfg = { + batch, + problem, + stride, + problem_batch, + problem_along_x, + policy, + prefer_wg_size}; + cfg.template build(); + + return cfg; + } + + template + static BatchKernelConfig make_config( + int64_t batch, + int64_t problem, + int64_t stride, + int64_t problem_batch, + bool problem_along_x, + std::vector policies, + int64_t prefer_wg_size = 0) { + BatchKernelConfig cfg = { + batch, + problem, + stride, + problem_batch, + problem_along_x, + policies, + prefer_wg_size}; + cfg.template build(); + + return cfg; + } + + sycl::range<2> global_size() const { + return {glb_range_y_, glb_range_x_}; + } + + sycl::range<2> group_size() const { + return {wg_range_y_, wg_range_x_}; + } + + struct ItemDesc { + /* chunk id along problem dim */ size_t chunk; + /* problem chunk size */ size_t chunk_size; + /* offsite in current chunk */ size_t chunk_off; + /* how many active chunks along problem dim */ size_t chunk_num; + /* global batch id */ size_t glb_batch; + /* global problem id */ size_t glb_problem; + }; + + ItemDesc get_item_desc(sycl::nd_item<2> item) const { + auto lix = item.get_local_id(1); + auto liy = item.get_local_id(0); + auto lrx = item.get_local_range(1); + auto lry = item.get_local_range(0); + auto wgrx = item.get_group_range(1); + auto wgry = item.get_group_range(0); + auto gix = item.get_global_id(1); + auto giy = item.get_global_id(0); + auto gx = item.get_group(1); + auto gy = item.get_group(0); + + // ItemDesc::glb_problem is 
meaningless, if policy is loop for all. + if (problem_along_x_) { + return {gx, lrx, lix, wgrx, giy, gix}; + } else { + return {gy, lry, liy, wgry, gix, giy}; + } + } + + // iterate over problems and batchs for `pAdaptive` policy + // # update workload status inplace in `desc`. + // # prioritize problem iteration. + bool next(sycl::nd_item<2> item, ItemDesc& desc) const { + auto next_problem = desc.glb_problem + problem_glb_range_; + auto next_batch = desc.glb_batch + batch_glb_range_; + auto cur_chunk = desc.chunk; + + // WA: break deduce chain, or offline compiler gets crash, due to, + // massive and deep divergence level + desc = get_item_desc(item); + + // iterate over problem + if (next_problem < problem_range_) { + desc.glb_problem = next_problem; + desc.chunk = cur_chunk + desc.chunk_num; + return true; + } + + // iterate over batch + if (next_batch < batch_range_) { + desc.glb_batch = next_batch; + return true; + } + + return false; + } + + static Policy suggest_policy( + int64_t batch, + int64_t problem, + int64_t stride, + bool problem_along_x, + bool bypass_adaptive_policy = true) { + auto target_wi_num = syclMaxWorkItemsPerTile(); + + if (!bypass_adaptive_policy && batch * problem * stride >= target_wi_num) { + return Policy::pAdaptive; + } + // Using device max work group size to deduce range configuration + // approximately. + BatchKernelConfig cfg_ = make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + Policy::pLoop, + syclDeviceMaxWorkGroupSize()); + size_t wg_num = (cfg_.glb_range_x_ / cfg_.wg_range_x_) * + (cfg_.glb_range_y_ / cfg_.wg_range_y_); + size_t wg_size = cfg_.wg_range_x_ * cfg_.wg_range_y_; + if (wg_size * (wg_num + 1) > target_wi_num) { + return Policy::pLoop; + } + + return Policy::pSegment; + } + BatchKernelConfig( int64_t batch, int64_t problem, @@ -52,11 +192,14 @@ class BatchKernelConfig { template void build() { - size_t wg_size = syclMaxWorkGroupSize(); + size_t wg_size; size_t sg_size = syclMaxSubGroupSize(); + // Caller takes responsibility of if work group size is valid or compatible. if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 && - prefer_wg_size_ < wg_size) { + prefer_wg_size_ <= syclDeviceMaxWorkGroupSize()) { wg_size = prefer_wg_size_; + } else { + wg_size = syclMaxWorkGroupSize(); } wg_range_x_ = sg_size; wg_range_y_ = wg_size / wg_range_x_; @@ -170,96 +313,6 @@ class BatchKernelConfig { }(), prefer_wg_size) {} - sycl::range<2> global_size() const { - return {glb_range_y_, glb_range_x_}; - } - - sycl::range<2> group_size() const { - return {wg_range_y_, wg_range_x_}; - } - - struct ItemDesc { - /* chunk id along problem dim */ size_t chunk; - /* problem chunk size */ size_t chunk_size; - /* offsite in current chunk */ size_t chunk_off; - /* how many active chunks along problem dim */ size_t chunk_num; - /* global batch id */ size_t glb_batch; - /* global problem id */ size_t glb_problem; - }; - - ItemDesc get_item_desc(sycl::nd_item<2> item) const { - auto lix = item.get_local_id(1); - auto liy = item.get_local_id(0); - auto lrx = item.get_local_range(1); - auto lry = item.get_local_range(0); - auto wgrx = item.get_group_range(1); - auto wgry = item.get_group_range(0); - auto gix = item.get_global_id(1); - auto giy = item.get_global_id(0); - auto gx = item.get_group(1); - auto gy = item.get_group(0); - - // ItemDesc::glb_problem is meaningless, if policy is loop for all. 
- if (problem_along_x_) { - return {gx, lrx, lix, wgrx, giy, gix}; - } else { - return {gy, lry, liy, wgry, gix, giy}; - } - } - - // iterate over problems and batchs for `pAdaptive` policy - // # update workload status inplace in `desc`. - // # prioritize problem iteration. - bool next(sycl::nd_item<2> item, ItemDesc& desc) const { - auto next_problem = desc.glb_problem + problem_glb_range_; - auto next_batch = desc.glb_batch + batch_glb_range_; - auto cur_chunk = desc.chunk; - - // WA: break deduce chain, or offline compiler gets crash, due to, - // massive and deep divergence level - desc = get_item_desc(item); - - // iterate over problem - if (next_problem < problem_range_) { - desc.glb_problem = next_problem; - desc.chunk = cur_chunk + desc.chunk_num; - return true; - } - - // iterate over batch - if (next_batch < batch_range_) { - desc.glb_batch = next_batch; - return true; - } - - return false; - } - - static Policy suggest_policy( - int64_t batch, - int64_t problem, - int64_t stride, - bool problem_along_x, - bool bypass_adaptive_policy = true) { - auto target_wi_num = syclMaxWorkItemsPerTile(); - - if (!bypass_adaptive_policy && batch * problem * stride >= target_wi_num) { - return Policy::pAdaptive; - } - - BatchKernelConfig cfg_ = { - batch, problem, stride, batch * stride, problem_along_x, Policy::pLoop}; - size_t wg_num = (cfg_.glb_range_x_ / cfg_.wg_range_x_) * - (cfg_.glb_range_y_ / cfg_.wg_range_y_); - size_t wg_size = cfg_.wg_range_x_ * cfg_.wg_range_y_; - - if (wg_size * (wg_num + 1) > target_wi_num) { - return Policy::pLoop; - } - - return Policy::pSegment; - } - public: /* logical shape desc */ int64_t batch_; /* logical shape desc */ int64_t problem_; diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 7fd51ddf3..eab6f4c48 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -371,9 +371,8 @@ void launch_max_pool2d_kernel( auto& queue = at::xpu::getCurrentSYCLQueue(); int outputSize = numBatch * numPlane * outputSizeH * outputSizeW; int stride = numPlane * outputSizeH * outputSizeW; - BatchKernelConfig cfg = { - 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - cfg.template build(); + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); auto kfn = KernelClass( output, indices, @@ -436,8 +435,8 @@ void launch_max_pool2d_backward_kernel( using KernelClass = MaxPool2dBackwardDeterministicKernelFunctor; - BatchKernelConfig cfg = { - 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.template build(); auto kfn = KernelClass( gradInput, @@ -468,8 +467,8 @@ void launch_max_pool2d_backward_kernel( numBatch * numPlane * gradOutputSizeH * gradOutputSizeW; using KernelClass = MaxPool2dBackwardKernelFunctor; - BatchKernelConfig cfg = { - 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.template build(); auto kfn = KernelClass( gradInput, diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 5b9e4fa6b..0c7338d5d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -57,9 
+57,8 @@ void embedding_bag( vec_idx_t* max_idx_vec = reinterpret_cast(max_index); vec_len = vec_len / vec_size; - BatchKernelConfig cfg = { - bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive}; - cfg.template build(); + BatchKernelConfig cfg = BatchKernelConfig::make_config( + bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive); index_t fixing_bag_size = ignore_offsets ? index_size / bag_num : 0; auto kfn = KernelClass( diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h index d78ddfbf6..fcb429ef5 100644 --- a/src/ATen/native/xpu/sycl/Indexing.h +++ b/src/ATen/native/xpu/sycl/Indexing.h @@ -866,21 +866,17 @@ void launch_index_put_deterministic_kernel( return; } int64_t v_stride_before = numel * stride; - BatchKernelConfig cfg = { + // align with precision of CPU backend. + using accscalar_t = scalar_t; /* acc_type; */ + using KernelClass = IndexPutDeterministicKernelFunctor; + BatchKernelConfig cfg = BatchKernelConfig::make_config( /* num of indices */ numel, /* num of elements to put per indices */ outer_dim * stride, 1, numel, true, {BatchKernelConfig::Policy::pSegment, - BatchKernelConfig::Policy::pAggressiveSplit}}; - - // align with precision of CPU backend. - using accscalar_t = scalar_t; /* acc_type; */ - using KernelClass = IndexPutDeterministicKernelFunctor; - - cfg.template build(); - + BatchKernelConfig::Policy::pAggressiveSplit}); KernelClass kfn( sorted_indices, indices, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp new file mode 100644 index 000000000..dd93f68c3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -0,0 +1,1027 @@ +#include +#include +#include +#include +#include +#include +#include +#include "comm/Runtime.h" + +namespace at::native::xpu { + +template +struct ReduceAdd { + T operator()(const T a, const T b) const { + return a + b; + } +}; + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t ldr_pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t str_pi = id.chunk; + int64_t ldr_lid = + si + ldr_pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t ldr_off = at::xpu::detail::IndexToOffset::get( + ldr_lid, + iinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t str_lid = + si + str_pi * cfg_.stride_ + bi * id.chunk_num * cfg_.stride_; + int64_t str_off = at::xpu::detail::IndexToOffset::get( + str_lid, + oinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t value = 0; + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + value = (accscalar_t)iinfo_.data[ldr_off]; + if (need_squre_) + value *= value; + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (id.chunk_off == 0) { + oinfo_.data[str_off] = is_final_ ? 
sqrtf(value) : value; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(shared_memeory_size_, cgh); + } + WeightNormReduceKernelFunctor( + ScalarTypeInfo iinfo, + AccTypeInfo oinfo, + BatchKernelConfig cfg, + bool need_squre, + bool is_final, + int64_t shared_memeory_size) + : iinfo_(iinfo), + oinfo_(oinfo), + cfg_(cfg), + need_squre_(need_squre), + is_final_(is_final), + shared_memeory_size_(shared_memeory_size) {} + + private: + ScalarTypeInfo iinfo_; + AccTypeInfo oinfo_; + BatchKernelConfig cfg_; + bool need_squre_; + bool is_final_; + int64_t shared_memeory_size_; + sycl_local_acc_t shared_; +}; + +template +static inline void launch_weight_norm_reduce_kernel( + ScalarTypeInfo& iinfo, + AccTypeInfo& oinfo, + BatchKernelConfig& cfg, + bool need_squre, + bool is_final) { + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + + WeightNormReduceKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t> + kfn(iinfo, oinfo, cfg, need_squre, is_final, cfg.group_size().size()); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template +static inline void weight_norm_reduce( + ScalarTypeInfo& vinfo, + AccTypeInfo& ninfo, + int dim_after_collapse, + bool need_square) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormReduceKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + if (cfg.problem_ <= cfg.problem_wg_range_) { + launch_weight_norm_reduce_kernel(vinfo, ninfo, cfg, need_square, true); + return; + } + + Tensor carrier = at::empty( + {cfg.batch_, cfg.problem_glb_range_ / cfg.problem_wg_range_, cfg.stride_}, + map_options()); + auto cinfo = + at::xpu::detail::getTensorInfo( + carrier); + launch_weight_norm_reduce_kernel(vinfo, cinfo, cfg, need_square, false); + + weight_norm_reduce(cinfo, ninfo, 1, false); + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t> +struct SegmentWeightNormKernelFunctor { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t w_lid = si + pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t n_lid = id.glb_batch; + + int64_t v_off = at::xpu::detail::IndexToOffset::get( + w_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t w_off = at::xpu::detail::IndexToOffset::get( + w_lid, + winfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + if (id.glb_problem < 
cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + winfo_.data[w_off] = + (1.f / ninfo_.data[n_off]) * vinfo_.data[v_off] * ginfo_.data[g_off]; + } + } + SegmentWeightNormKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo winfo, + AccTypeInfo ninfo, + BatchKernelConfig cfg) + : vinfo_(vinfo), ginfo_(ginfo), winfo_(winfo), ninfo_(ninfo), cfg_(cfg) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo winfo_; + AccTypeInfo ninfo_; + BatchKernelConfig cfg_; +}; + +template +static inline void segment_weight_norm( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& winfo, + AccTypeInfo& ninfo, + int dim_after_collapse) { + // segment reduce for statistics + weight_norm_reduce(vinfo, ninfo, dim_after_collapse, true); + + // normalization + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + + using KernelClass = SegmentWeightNormKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + KernelClass kfn(vinfo, ginfo, winfo, ninfo, cfg); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t n_lid = id.glb_batch; + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk_off; + bi = si + bi * cfg_.problem_ * cfg_.stride_; + + accscalar_t value = 0; + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid = bi + pi_ * cfg_.stride_; + int64_t v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t v = (accscalar_t)vinfo_.data[v_off]; + value += v * v; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + int n_slid = (int)id.glb_batch % batch_wg_range_; + if (id.glb_batch < cfg_.problem_batch_ && id.chunk_off == 0) { + value = sqrtf(value); + ninfo_.data[n_off] = value; + shared_[n_slid] = value; + } + // Here using slm instead. If using ugm, need fence w/ + // order:acq_rel & scope:workgroup & space:global_mem. 
+ item.barrier(sycl_local_fence); + + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid = bi + pi_ * cfg_.stride_; + int64_t v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t w_off = at::xpu::detail::IndexToOffset::get( + v_lid, + winfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + winfo_.data[w_off] = + (1.f / shared_[n_slid]) * vinfo_.data[v_off] * ginfo_.data[g_off]; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(wg_size_, cgh); + } + + WeightNormKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo winfo, + AccTypeInfo ninfo, + BatchKernelConfig cfg, + int wg_size, + int batch_wg_range) + : vinfo_(vinfo), + ginfo_(ginfo), + winfo_(winfo), + ninfo_(ninfo), + cfg_(cfg), + wg_size_(wg_size), + batch_wg_range_(batch_wg_range) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo winfo_; + AccTypeInfo ninfo_; + BatchKernelConfig cfg_; + int wg_size_; + int batch_wg_range_; + sycl_local_acc_t shared_; +}; + +template +static inline void weight_norm( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& winfo, + AccTypeInfo& ninfo, + int dim_after_collapse) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + + using KernelClass = WeightNormKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + BatchKernelConfig::Policy::pLoop); + + int wg_size = cfg.group_size().size(); + int batch_wg_range = wg_size / cfg.problem_wg_range_; + KernelClass kfn(vinfo, ginfo, winfo, ninfo, cfg, wg_size, batch_wg_range); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + + return; +} + +std::tuple weight_norm_kernel( + const Tensor& v, + const Tensor& g, + int64_t dim) { + TORCH_INTERNAL_ASSERT( + dim == 0 || dim == v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + + at::ScalarType scalar_acc_t = (g.scalar_type() == at::ScalarType::Half || + g.scalar_type() == at::ScalarType::BFloat16) + ? 
at::ScalarType::Float + : g.scalar_type(); + auto norms = at::empty( + g.sizes(), g.options().dtype(scalar_acc_t), g.suggest_memory_format()); + auto w = at::empty(v.sizes(), v.options(), v.suggest_memory_format()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + v.scalar_type(), + "aten::weight_norm", + [&] { + auto vinfo = at::xpu::detail::getTensorInfo(v); + int dim_after_collapse = vinfo.collapseDims(dim); + auto ginfo = at::xpu::detail::getTensorInfo(g); + ginfo.collapseDims(); + + auto winfo = at::xpu::detail::getTensorInfo(w); + winfo.collapseDims(dim); + using accscalar_t = acc_type; + auto ninfo = + at::xpu::detail::getTensorInfo(norms); + ninfo.collapseDims(); + dim_after_collapse = 1 - dim_after_collapse; // remain dim + + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = + vinfo.strides[dim_after_collapse] == 1 ? true : false; + if (BatchKernelConfig::Policy::pSegment == + BatchKernelConfig::suggest_policy( + batch, problem, stride, problem_along_x)) { + segment_weight_norm(vinfo, ginfo, winfo, ninfo, dim_after_collapse); + } else { + weight_norm(vinfo, ginfo, winfo, ninfo, dim_after_collapse); + } + }); + + return {w, norms}; +} + +template < + bool is_first, + class ScalarType1Info, + class ScalarType2Info, + class AccTypeInfo, + typename scalar1_t, + typename scalar2_t, + typename accscalar_t, + typename vec_t> +struct WeightNormBackwardReduceKernelFunctor + : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t i_pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t o_pi = id.chunk; + + int64_t i_lid = + si + i_pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t i1_off = at::xpu::detail::IndexToOffset::get( + i_lid, + i1info_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t i2_off; + if (is_first) { + i2_off = at::xpu::detail::IndexToOffset::get( + i_lid, + i2info_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + } + + int64_t o_lid = si + o_pi * cfg_.stride_ + bi * id.chunk_num * cfg_.stride_; + int64_t o_off = at::xpu::detail::IndexToOffset::get( + o_lid, + oinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t value = 0; + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (is_first) { + auto value1 = (accscalar_t)i1info_.data[i1_off]; + auto value2 = (accscalar_t)i2info_.data[i2_off]; + value = value1 * value2; + } else { + value = (accscalar_t)i1info_.data[i1_off]; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (id.chunk_off == 0) { + oinfo_.data[o_off] = value; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(local_size_, cgh); + } + WeightNormBackwardReduceKernelFunctor( + ScalarType1Info i1info, + ScalarType2Info i2info, + AccTypeInfo oinfo, + BatchKernelConfig cfg, + int64_t local_size) + : i1info_(i1info), + i2info_(i2info), + oinfo_(oinfo), + cfg_(cfg), + local_size_(local_size) {} + + private: + ScalarType1Info i1info_; 
+ ScalarType2Info i2info_; + AccTypeInfo oinfo_; + BatchKernelConfig cfg_; + int64_t local_size_; + sycl_local_acc_t shared_; +}; + +template < + bool is_first, + class ScalarType1Info, + class ScalarType2Info, + class AccTypeInfo> +static inline void launch_weight_norm_backward_reduce_kernel( + ScalarType1Info& i1info, + ScalarType2Info& i2info, + AccTypeInfo& oinfo, + BatchKernelConfig& cfg) { + using scalar1_t = typename ScalarType1Info::scalar_t; + using scalar2_t = typename ScalarType2Info::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + WeightNormBackwardReduceKernelFunctor< + is_first, + ScalarType1Info, + ScalarType2Info, + AccTypeInfo, + scalar1_t, + scalar2_t, + accscalar_t, + vec_t> + kfn(i1info, i2info, oinfo, cfg, cfg.group_size().size()); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template +static inline void weight_norm_backward_reduce( + ScalarType1Info& vinfo, + ScalarType2Info& gwinfo, + AccTypeInfo& rinfo, + int dim_after_collapse, + bool is_first) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + + using scalar1_t = typename ScalarType1Info::scalar_t; + using scalar2_t = typename ScalarType2Info::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormBackwardReduceKernelFunctor< + true, + ScalarType1Info, + ScalarType2Info, + AccTypeInfo, + scalar1_t, + scalar2_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + if (cfg.problem_ <= cfg.problem_wg_range_) { + if (is_first) { + launch_weight_norm_backward_reduce_kernel( + vinfo, gwinfo, rinfo, cfg); + } else { + launch_weight_norm_backward_reduce_kernel( + vinfo, gwinfo, rinfo, cfg); + } + return; + } + + Tensor carrier = at::empty( + {cfg.batch_, cfg.problem_glb_range_ / cfg.problem_wg_range_, cfg.stride_}, + map_options()); + auto cinfo = + at::xpu::detail::getTensorInfo( + carrier); + if (is_first) { + launch_weight_norm_backward_reduce_kernel(vinfo, gwinfo, cinfo, cfg); + } else { + launch_weight_norm_backward_reduce_kernel(vinfo, gwinfo, cinfo, cfg); + } + + weight_norm_backward_reduce(cinfo, gwinfo, rinfo, 1, false); + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t> +struct SegmentWeightNormBackwardKernelFunctor { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk * id.chunk_size + id.chunk_off; + + int64_t gv_lid = si + pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t gg_lid = id.glb_batch; + + int64_t v_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gw_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gv_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + gvinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + 
NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t r_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + rinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gg_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + gginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + accscalar_t g = ginfo_.data[g_off]; + accscalar_t gw = gwinfo_.data[gw_off]; + accscalar_t v = vinfo_.data[v_off]; + accscalar_t n = 1.f / ninfo_.data[n_off]; + accscalar_t r = rinfo_.data[r_off]; + accscalar_t gg = r * n; + accscalar_t n3 = n * n * n; + accscalar_t gv = g * (n * gw - n3 * v * r); + + gvinfo_.data[gv_off] = static_cast(gv); + if (id.chunk == 0 && id.chunk_off == 0) + gginfo_.data[gg_off] = static_cast(gg); + } + } + SegmentWeightNormBackwardKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo gwinfo, + AccTypeInfo ninfo, + ScalarTypeInfo gvinfo, + ScalarTypeInfo gginfo, + AccTypeInfo rinfo, + BatchKernelConfig cfg) + : vinfo_(vinfo), + ginfo_(ginfo), + gwinfo_(gwinfo), + ninfo_(ninfo), + gvinfo_(gvinfo), + gginfo_(gginfo), + rinfo_(rinfo), + cfg_(cfg) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo gwinfo_; + AccTypeInfo ninfo_; + ScalarTypeInfo gvinfo_; + ScalarTypeInfo gginfo_; + AccTypeInfo rinfo_; + BatchKernelConfig cfg_; +}; + +template +static inline void segment_weight_norm_backward( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& gwinfo, + AccTypeInfo& ninfo, + ScalarTypeInfo& gvinfo, + ScalarTypeInfo& gginfo, + AccTypeInfo& rinfo, + int dim_after_collapse) { + // segment reduce + weight_norm_backward_reduce(vinfo, gwinfo, rinfo, dim_after_collapse, true); + + // compute gradient + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? 
true : false; + + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using KernelClass = SegmentWeightNormBackwardKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + KernelClass kfn(vinfo, ginfo, gwinfo, ninfo, gvinfo, gginfo, rinfo, cfg); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t n_lid = id.glb_batch; + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t gg_off = at::xpu::detail::IndexToOffset::get( + n_lid, + gginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk_off; + bi = si + bi * cfg_.problem_ * cfg_.stride_; + + accscalar_t value = 0; + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid, v_off, gw_off; + v_lid = bi + pi_ * cfg_.stride_; + + v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gw_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t v = (accscalar_t)vinfo_.data[v_off]; + accscalar_t gw = (accscalar_t)gwinfo_.data[gw_off]; + value += v * gw; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + int n_slid = (int)id.glb_batch % batch_wg_range_; + if (id.glb_batch < cfg_.problem_batch_ && id.chunk_off == 0) { + shared_[n_slid] = value; + } + item.barrier(sycl_local_fence); + + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid, v_off, gw_off, gv_off; + v_lid = bi + pi_ * cfg_.stride_; + + v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gw_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gv_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gvinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t g = ginfo_.data[g_off]; + accscalar_t gw = gwinfo_.data[gw_off]; + accscalar_t v = vinfo_.data[v_off]; + accscalar_t n = 1.f / ninfo_.data[n_off]; + accscalar_t r = shared_[n_slid]; + accscalar_t gg = r * n; + accscalar_t n3 = n * n * n; + accscalar_t gv = g * (n * gw - n3 * v * r); + + gvinfo_.data[gv_off] = static_cast(gv); + if (id.chunk_off == 0) + gginfo_.data[gg_off] = static_cast(gg); + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(wg_size_, cgh); + } + + 
WeightNormBackwardKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo gwinfo, + AccTypeInfo ninfo, + ScalarTypeInfo gvinfo, + ScalarTypeInfo gginfo, + BatchKernelConfig cfg, + int wg_size, + int batch_wg_range) + : vinfo_(vinfo), + ginfo_(ginfo), + gwinfo_(gwinfo), + ninfo_(ninfo), + gvinfo_(gvinfo), + gginfo_(gginfo), + cfg_(cfg), + wg_size_(wg_size), + batch_wg_range_(batch_wg_range) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo gwinfo_; + AccTypeInfo ninfo_; + ScalarTypeInfo gvinfo_; + ScalarTypeInfo gginfo_; + BatchKernelConfig cfg_; + int wg_size_; + int batch_wg_range_; + sycl_local_acc_t shared_; +}; + +template +static inline void weight_norm_backward( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& gwinfo, + AccTypeInfo& ninfo, + ScalarTypeInfo& gvinfo, + ScalarTypeInfo& gginfo, + int dim_after_collapse) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormBackwardKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + BatchKernelConfig::Policy::pLoop); + int wg_size = cfg.group_size().size(); + int batch_wg_range = wg_size / cfg.problem_wg_range_; + KernelClass kfn( + vinfo, + ginfo, + gwinfo, + ninfo, + gvinfo, + gginfo, + cfg, + wg_size, + batch_wg_range); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + return; +} + +std::tuple weight_norm_backward_kernel( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + int64_t dim) { + auto grad_v = at::empty_like(saved_v, c10::get_contiguous_memory_format()); + auto grad_g = at::empty_like(saved_g, c10::get_contiguous_memory_format()); + + at::ScalarType scalar_acc_t = + (saved_g.scalar_type() == at::ScalarType::Half || + saved_g.scalar_type() == at::ScalarType::BFloat16) + ? at::ScalarType::Float + : saved_g.scalar_type(); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + saved_v.scalar_type(), + "aten::weight_norm_backward", + [&] { + auto vinfo = at::xpu::detail::getTensorInfo(saved_v); + int dim_after_collapse = vinfo.collapseDims(dim); + + auto ginfo = at::xpu::detail::getTensorInfo(saved_g); + ginfo.collapseDims(); + + auto gwinfo = at::xpu::detail::getTensorInfo(grad_w); + gwinfo.collapseDims(dim); + using accscalar_t = acc_type; + auto ninfo = + at::xpu::detail::getTensorInfo(saved_norms); + ninfo.collapseDims(); + + auto gvinfo = at::xpu::detail::getTensorInfo(grad_v); + gvinfo.collapseDims(dim); + + auto gginfo = at::xpu::detail::getTensorInfo(grad_g); + gginfo.collapseDims(); + + dim_after_collapse = 1 - dim_after_collapse; // remain dim + + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = + vinfo.strides[dim_after_collapse] == 1 ? 
true : false; + if (BatchKernelConfig::Policy::pSegment == + BatchKernelConfig::suggest_policy( + batch, problem, stride, problem_along_x)) { + auto reduce = at::empty( + saved_g.sizes(), + saved_g.options().dtype(scalar_acc_t), + c10::get_contiguous_memory_format()); + auto rinfo = + at::xpu::detail::getTensorInfo(reduce); + rinfo.collapseDims(); + + segment_weight_norm_backward( + vinfo, + ginfo, + gwinfo, + ninfo, + gvinfo, + gginfo, + rinfo, + dim_after_collapse); + } else { + weight_norm_backward( + vinfo, ginfo, gwinfo, ninfo, gvinfo, gginfo, dim_after_collapse); + } + }); + + return {grad_v, grad_g}; +} + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h new file mode 100644 index 000000000..5b0d7afd2 --- /dev/null +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -0,0 +1,16 @@ +#pragma once +#include + +namespace at::native::xpu { +std::tuple weight_norm_kernel( + const Tensor& v, + const Tensor& g, + int64_t dim); + +std::tuple weight_norm_backward_kernel( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + int64_t dim); +} // namespace at::native::xpu \ No newline at end of file diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 5c354e0b0..2ca8e8fc1 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -694,4 +694,6 @@ supported: - renorm.out - renorm_ - nan_to_num.out + - _weight_norm_interface + - _weight_norm_interface_backward - range.out
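As a closing illustration, a small sketch of how the fused weight-norm path added in the last patch can be exercised from Python. This is a hedged smoke test, not part of the patch: it presumes an XPU-enabled build that registers these kernels and that the composite `_weight_norm` op selects the fused interface for dim 0.

```python
import torch
import torch.nn as nn

# weight_norm reparameterizes the weight as w = g * v / ||v||, with the norm
# taken per slice of dimension `dim`; the fused kernels above cover dim == 0
# and the last dim.
lin = nn.utils.weight_norm(nn.Linear(16, 32), name="weight", dim=0).to("xpu")
x = torch.randn(4, 16, device="xpu")
loss = lin(x).sum()
loss.backward()  # may route through _weight_norm_interface_backward
print(lin.weight_g.grad.shape, lin.weight_v.grad.shape)
```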