From fbd095a57bb146c1360e2dfe5c8cd7ef615567ae Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Tue, 23 Jul 2024 20:04:00 -0700 Subject: [PATCH 01/17] Drop CPU fallback for aten::norm.out and aten::nextafter.out (#641) Fixes: #640 Fixes: 6eca394 ("Add aten::nextafter and its variants") CC: @fengyuan14 @yucai-intel @xytintel Signed-off-by: Dmitry Rogozhkin --- src/ATen/native/xpu/XPUFallback.template | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index d5ae19d5a..52b417415 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -239,8 +239,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "nanmedian", "nanmedian.dim_values", "nansum", - "norm.out", - "nextafter.out", "ormqr", "_pdist_backward", "_pdist_forward", From 07ebadbd582305ca4dac2759358e8be651979df4 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Wed, 24 Jul 2024 11:05:17 +0800 Subject: [PATCH 02/17] Update the comments of the radix sort kernel (#633) Update the comments of the radix sort kernel. Co-authored-by: Feng Yuan --- src/ATen/native/xpu/sycl/SortingKernels.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h index fb29a9f53..b39595143 100644 --- a/src/ATen/native/xpu/sycl/SortingKernels.h +++ b/src/ATen/native/xpu/sycl/SortingKernels.h @@ -539,9 +539,7 @@ void segmented_sort_pairs_( int num_elements) { constexpr int scaling_coef = sizeof(key_t) * sizeof(value_t) >= 64 ? 2 - : 1; // Attempt to reduce register pressure. The result will be incorrect - // when using too many local variables (registers). - // https://github.com/intel/torch-xpu-ops/issues/626 + : 1; // Attempt to reduce register pressure for performance. if (num_elements > 4096 / scaling_coef) { // Considering register pressure, we use a problem size of 4096 to delineate // the boundary between single tile sort and group sort. 
From f716a582852f562e2ad26a0381885b0509168fe9 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:54:54 +0800 Subject: [PATCH 03/17] Add aten::repeat_interleave (#564) ![image](https://github.com/intel/torch-xpu-ops/assets/105194461/1fe5b8bc-c82a-41eb-b673-7c5cb6f5d3cf) --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 1 - src/ATen/native/xpu/Repeat.cpp | 10 +++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 79 +++++++++++++++++++++++ src/ATen/native/xpu/sycl/RepeatKernel.h | 9 +++ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 7 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/Repeat.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.h diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 99069eefb..826b1e27b 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -47,7 +47,6 @@ static void cum_ops_meta( maybe_wrap_dim(dim, self.dim()); ScalarType out_dtype; - if (result.defined()) { out_dtype = dtype.value_or(result.scalar_type()); at::xpu::resize_out( diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp new file mode 100644 index 000000000..38e5ae8da --- /dev/null +++ b/src/ATen/native/xpu/Repeat.cpp @@ -0,0 +1,10 @@ +#include +#include +#include +namespace at { +Tensor XPUNativeFunctions::repeat_interleave( + const Tensor& repeats, + c10::optional output_size) { + return at::native::xpu::repeat_interleave_kernel(repeats, output_size); +} +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 52b417415..356a45f70 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -248,7 +248,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "repeat_interleave.Tensor", "replication_pad1d_backward.grad_input", "replication_pad1d.out", "replication_pad2d_backward", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp new file mode 100644 index 000000000..c3e27ab62 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +namespace at::native::xpu { +template +struct RepeatInterleaveKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + auto rep_ptr = rep_data_; + auto cum_ptr = cum_data_; + auto res_ptr = res_data_; + + for (int64_t i = item.get_global_id(0); i < size_; + i += item.get_global_range()[0]) { + int64_t end = cum_ptr[i]; + int64_t repeat = rep_ptr[i]; + int64_t start = end - repeat; + for (int64_t j = start; j < end; j++) { + res_ptr[j] = i; + } + } + } + RepeatInterleaveKernelFunctor( + const index_t* rep_data, + const int64_t* cum_data, + index_t* res_data, + int64_t size, + int64_t result_size) + : rep_data_(rep_data), + cum_data_(cum_data), + res_data_(res_data), + size_(size), + result_size_(result_size) {} + + private: + const index_t* rep_data_; + const int64_t* cum_data_; + index_t* res_data_; + int64_t size_; + int64_t result_size_; +}; + +template +static void compute_xpu( + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, + index_t* result_ptr, + int64_t size, + int64_t result_size) { + if (size == 0) + return; + + auto kfn = RepeatInterleaveKernelFunctor( + 
repeat_ptr, + cumsum_ptr, + result_ptr, + size, + result_size); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t local_range = size < wg_size ? size : wg_size; + int64_t global_range = ((size + local_range - 1) / local_range) * local_range; + + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); +} + +Tensor repeat_interleave_kernel( + const Tensor& repeat, + c10::optional output_size) { + Tensor output; + + AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_xpu", [&] { + output = repeat_interleave_common>( + repeat, output_size); + }); + return output; +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h new file mode 100644 index 000000000..3df41a956 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -0,0 +1,9 @@ +#pragma once +#include +namespace at::native::xpu { + +Tensor repeat_interleave_kernel( + const Tensor& repeats, + c10::optional output_size); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 3e3b5d4d1..96e43058a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "repeat_interleave", "fmax", "fmin", "floor", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index cd6f80b84..48ddc4c3f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,7 @@ supported: - ceil - ceil_ - ceil.out + - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out - norm.ScalarOpt_dim From d25d3d09a2a01867bf613f9cc535c596832eb2e8 Mon Sep 17 00:00:00 2001 From: yuchengliu1 Date: Wed, 24 Jul 2024 14:42:30 +0800 Subject: [PATCH 04/17] pow_Tensor_Scalar: fixing error result when scalar exp is a bool. (#638) Fixing TestBinaryUfuncsXPU.test_pow_xpu_int --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Pow.cpp | 8 +++++++- test/xpu/run_test_with_skip.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index df335bfa1..97dc5a0c2 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -58,7 +58,13 @@ Tensor& XPUNativeFunctions::pow_out( Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { Tensor out; auto iter = pow_tensor_scalar_meta(self, exponent, out); - native::xpu::pow_tensor_scalar_kernel(iter, exponent); + if (exponent.equal(0.0) || exponent.equal(false)) { + iter.output().fill_(1); + } else if (exponent.equal(1.0) || exponent.equal(true)) { + iter.output().copy_(self); + } else { + native::xpu::pow_tensor_scalar_kernel(iter, exponent); + } return iter.output(); } diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 0e00da513..3ffc3feb6 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -797,12 +797,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_list = ( "test_fmod_remainder_by_zero_integral_xpu_int64", # zero division is an undefined behavior: different handles on different backends "test_div_rounding_numpy_xpu_float16", # Calculation error. XPU implementation uses opmath type. - # RuntimeError: false INTERNAL ASSERT FAILED at "torch-xpu-ops/src/ATen/native/xpu/sycl/PowKernels.cpp":233, please report a bug to PyTorch. 
invalid combination of type in Pow function, common dtype: Short, exp is integral? 0 + # fail in complex_exponents=[-1.0 - 1.5j, 3.3j] + # Mismatched elements: 33 / 100 (33.0%) + # Greatest absolute difference: 0.00038337233127094805 at index (4,) (up to 1e-05 allowed) + # Greatest relative difference: 1.9085073290625587e-06 at index (6,) (up to 1.3e-06 allowed) "test_pow_xpu_int16", "test_pow_xpu_int32", "test_pow_xpu_int64", - "test_pow_xpu_int8", - "test_pow_xpu_uint8", # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available. "_jiterator_", # Unexpected success From e1e195bd4d6f9541fc05c54579242f07f30a63b3 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:55:52 +0800 Subject: [PATCH 05/17] Support XPU ABI=0 build (#531) # Motivation Support XPU ABI neutral build. Starting from compiler 2025.0, `libsycl.so` is ABI-neutral lib. Before this, the `libsycl-preview.so` is ABI-neutral. refer to https://github.com/pytorch/pytorch/pull/130110 --- cmake/BuildFlags.cmake | 6 +- cmake/Modules/FindSYCL.cmake | 5 +- cmake/Modules/FindSYCLToolkit.cmake | 73 ++++++------------------- cmake/SYCL.cmake | 18 ------ src/ATen/CMakeLists.txt | 2 +- src/ATen/native/xpu/sycl/AmpKernels.cpp | 4 +- src/CMakeLists.txt | 2 +- test/sycl/CMakeLists.txt | 2 +- 8 files changed, 28 insertions(+), 84 deletions(-) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index d0eb3e299..2f05d1ab0 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -59,10 +59,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func) set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value) set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz) + # Equivalent to build option -fpreview-breaking-changes for SYCL compiler. + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES) + set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}) endif() - # TODO: Align with PyTorch and switch to ABI=0 eventually, after - # resolving incompatible implementation in SYCL runtime. 
- set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) set(SYCL_FLAGS ${SYCL_FLAGS} ${SYCL_KERNEL_OPTIONS}) set(TORCH_XPU_OPS_FLAGS ${SYCL_HOST_FLAGS}) diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake index b8fc8ba22..4e266a26f 100644 --- a/cmake/Modules/FindSYCL.cmake +++ b/cmake/Modules/FindSYCL.cmake @@ -407,7 +407,6 @@ macro(SYCL_LINK_DEVICE_OBJECTS output_file sycl_target) OUTPUT ${output_file} DEPENDS ${object_files} COMMAND ${SYCL_EXECUTABLE} - -fsycl ${SYCL_device_link_flags} -fsycl-link ${object_files} -Xs "\"${SYCL_OFFLINE_COMPILER_FLAGS}\"" @@ -471,7 +470,7 @@ macro(SYCL_ADD_LIBRARY sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES @@ -530,7 +529,7 @@ macro(SYCL_ADD_EXECUTABLE sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake index e478cb4c5..46e34c7f8 100644 --- a/cmake/Modules/FindSYCLToolkit.cmake +++ b/cmake/Modules/FindSYCLToolkit.cmake @@ -25,16 +25,20 @@ This will define the following variables: #]=======================================================================] -set(SYCLTOOLKIT_FOUND False) -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +include(${TORCH_ROOT}/cmake/Modules/FindSYCLToolkit.cmake) -set(SYCL_ROOT "") -if(DEFINED ENV{SYCL_ROOT}) - set(SYCL_ROOT $ENV{SYCL_ROOT}) -elseif(DEFINED ENV{CMPLR_ROOT}) - set(SYCL_ROOT $ENV{CMPLR_ROOT}) +if(NOT SYCL_FOUND) + set(SYCLTOOLKIT_FOUND FALSE) + return() endif() +if(SYCLTOOLKIT_FOUND) + return() +endif() +set(SYCLTOOLKIT_FOUND TRUE) + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) + if(WIN32) set(SYCL_EXECUTABLE_NAME icx) else() @@ -71,43 +75,6 @@ if(nocmplr) set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() -find_file( - SYCL_INCLUDE_DIR - NAMES include - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_file( - SYCL_INCLUDE_SYCL_DIR - NAMES sycl - HINTS ${SYCL_ROOT}/include - NO_DEFAULT_PATH - ) - -list(APPEND SYCL_INCLUDE_DIR ${SYCL_INCLUDE_SYCL_DIR}) - -find_file( - SYCL_LIBRARY_DIR - NAMES lib lib64 - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_library( - SYCL_LIBRARY - NAMES sycl - HINTS ${SYCL_LIBRARY_DIR} - NO_DEFAULT_PATH -) - -if((NOT SYCL_INCLUDE_DIR) OR (NOT SYCL_LIBRARY_DIR) OR (NOT SYCL_LIBRARY)) - set(SYCLTOOLKIT_FOUND False) - set(SYCL_REASON_FAILURE "SYCL sdk is incomplete!!") - set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") - return() -endif() - # Function to write a test case to verify SYCL features. function(SYCL_CMPLR_TEST_WRITE src) @@ -202,6 +169,13 @@ set(SYCL_FLAGS "") set(SYCL_LINK_FLAGS "") list(APPEND SYCL_FLAGS "-fsycl") list(APPEND SYCL_LINK_FLAGS "-fsycl") +if(LINUX) + string(REGEX MATCH "libsycl-preview.so" is_abi_neutral ${SYCL_LIBRARY}) + if(is_abi_neutral) + list(APPEND SYCL_FLAGS "-fpreview-breaking-changes") + list(APPEND SYCL_LINK_FLAGS "-fpreview-breaking-changes") + endif() +endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -249,14 +223,3 @@ endif() message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") - -# Avoid module variables conflict due to calling find_package recursively -# e.g. 
find_package -> add_subdirectory(entering in a sub-project) -> find_package -# find_package_handle_standard_args( -# SYCLToolkit -# FOUND_VAR SYCLTOOLKIT_FOUND -# REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_LIBRARY SYCL_FLAGS -# VERSION_VAR SYCL_LANGUAGE_VERSION -# REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") -set(SYCLTOOLKIT_FOUND True) - diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index b2bbadaa4..f0724ef55 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -25,24 +25,6 @@ if(NOT SYCL_VERSION) return() endif() -find_library(SYCL_LIBRARIES sycl HINTS ${SYCL_LIBRARY_DIR}) -# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix, -# where the current version of the SYCL runtime is 7. -# Until oneAPI adds support to sycl.lib without the version suffix, -# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts. -# TODO: remove this when sycl.lib is supported on Windows -if(WIN32) - set(sycl_runtime_version 7) - find_library( - SYCL_LIBRARIES - NAMES "sycl${sycl_runtime_version}" - HINTS ${SYCL_LIBRARY_DIR} - ) - if(SYCL_LIBRARIES STREQUAL "SYCL_LIBRARIES-NOTFOUND") - message(FATAL_ERROR "Cannot find a SYCL library on Windows") - endif() -endif() - set(SYCL_COMPILER_VERSION) file(READ ${SYCL_VERSION} version_contents) string(REGEX MATCHALL "__SYCL_COMPILER_VERSION +[0-9]+" VERSION_LINE "${version_contents}") diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index c95e860fa..815ad018f 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,6 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp", "native/xpu/*.cpp", "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 5e8d258a6..87354eea9 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -102,9 +102,9 @@ void amp_foreach_non_finite_check_and_unscale_kernel( } struct AmpUpdateScaleKernelFunctor { - void operator()(sycl::item<1> item) const { + void operator()(sycl::nd_item<1> item) const { // There is only single item/task scheduled. - if (item.get_linear_id() != 0) + if (item.get_global_linear_id() != 0) return; if (*found_inf_) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5e0a8549..fa17491a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARIES}) + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) diff --git a/test/sycl/CMakeLists.txt b/test/sycl/CMakeLists.txt index 0671fbc94..5bbee8643 100644 --- a/test/sycl/CMakeLists.txt +++ b/test/sycl/CMakeLists.txt @@ -42,7 +42,7 @@ add_dependencies(test_sycl_build_archive sycl_simple_kernel_test) # Instead, we use explicit linkage option '--whole-archive', which is required # by linkage of device object modules archived in the static library. Then # explicit linkage configuration of SYCL runtime library is required. 
-target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARIES}) +target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARY}) if(INSTALL_TEST) install(TARGETS test_sycl_build_archive DESTINATION bin) From fcaa34cb5c740c81fab2126b0a68e5f4c724324a Mon Sep 17 00:00:00 2001 From: majing Date: Thu, 25 Jul 2024 08:45:49 +0800 Subject: [PATCH 06/17] Add aten::histogram and variant (#569) CPU/MPS only ops --------- Signed-off-by: majing Signed-off-by: Ma, Jing1 Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Histogram.cpp | 195 +++++++++++++++ src/ATen/native/xpu/sycl/HistogramKernels.h | 23 ++ .../native/xpu/sycl/HistogramddKernels.cpp | 227 ++++++++++++++++++ test/xpu/extended/run_test_with_skip.py | 5 + test/xpu/extended/test_ops_xpu.py | 6 + test/xpu/run_test_with_skip.py | 4 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 4 + 8 files changed, 465 insertions(+) create mode 100644 src/ATen/native/xpu/Histogram.cpp create mode 100644 src/ATen/native/xpu/sycl/HistogramKernels.h create mode 100644 src/ATen/native/xpu/sycl/HistogramddKernels.cpp diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp new file mode 100644 index 000000000..de49952b2 --- /dev/null +++ b/src/ATen/native/xpu/Histogram.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +namespace at { + +/* Checks properties of input tensors input, bins, and weight. + */ +void histogramdd_check_inputs( + const Tensor& input, + const Tensor& bins, + const std::optional& weight) { + if (weight.has_value()) { + TORCH_CHECK( + weight->device() == input.device(), + "weight and input need to be on the same device.") + } + auto input_dtype = input.dtype(); + auto bins_dtype = bins.dtype(); + TORCH_CHECK( + input_dtype == bins_dtype, + "torch.histogramdd: input tensor and bins tensors should", + " have the same dtype, but got input with dtype ", + input_dtype, + " and bins with dtype ", + bins_dtype); + + const int64_t bins_dim = bins.dim(); + TORCH_CHECK( + bins_dim == 1, + "torch.histogramdd: bins tensor should have one dimension,", + " but got ", + bins_dim, + " dimensions in the bin tensor"); + + const int64_t numel = bins.numel(); + TORCH_CHECK( + numel > 0, + "torch.histogramdd: bins tensor should have at least 1 element,", + " but got ", + numel, + " elements in the bin tensor"); + + if (weight.has_value()) { + TORCH_CHECK( + input.dtype() == weight.value().dtype(), + "torch.histogramdd: if weight tensor is provided, ", + "input tensor and weight tensor should have the same dtype, ", + "but got input(", + input.dtype(), + ")", + ", and weight(", + weight.value().dtype(), + ")"); + + auto input_sizes = input.sizes().vec(); + + auto weight_sizes = weight.value().sizes().vec(); + if (weight_sizes.empty()) { + // correctly handle scalars + weight_sizes = {1}; + } + + TORCH_CHECK( + input_sizes == weight_sizes, + "torch.histogramdd: if weight tensor is provided it should have", + " the same shape as the input tensor excluding its innermost ", + "dimension, but got input with shape ", + input.sizes(), + " and weight ", + "with shape ", + weight.value().sizes()); + } +} + +/* Checks properties of output tensors hist and bin_edges, then resizes them. 
+ */ +void histogramdd_prepare_out( + const Tensor& input, + int64_t bin_ct, + const Tensor& hist, + const Tensor& bin_edges) { + TORCH_CHECK( + input.dtype() == hist.dtype(), + "torch.histogram: input tensor and hist tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and hist ", + hist.dtype()); + + TORCH_CHECK( + input.dtype() == bin_edges.dtype(), + "torch.histogram: input tensor and bin_edges tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and bin_edges ", + bin_edges.dtype()); + + TORCH_CHECK( + bin_ct > 0, "torch.histogram(): bins must be > 0, but got ", bin_ct); + + at::native::resize_output(bin_edges, {bin_ct + 1}); + + at::native::resize_output(hist, {bin_ct}); +} + +void histogramdd_prepare_out( + const Tensor& input, + const Tensor& bins, + const Tensor& hist, + const Tensor& bin_edges) { + int64_t bin_ct = bins.numel() - 1; + histogramdd_prepare_out(input, bin_ct, hist, bin_edges); +} + +static Tensor& histogramdd_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + histogramdd_check_inputs(self, bins, weight); + histogramdd_prepare_out(self, bins, hist, bin_edges); + + bin_edges.copy_(bins); + + at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); + return hist; +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_out( + reshaped_self, bins, reshaped_weight, density, hist, bin_edges); + + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); + return histogram_out(self, bins, weight, density, hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? 
weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_prepare_out(reshaped_self, bin_ct, hist, bin_edges); + histogramdd_check_inputs(reshaped_self, bin_edges, reshaped_weight); + + at::native::xpu::histogramdd_linear_kernel( + reshaped_self, bin_ct, range, reshaped_weight, density, hist, bin_edges); + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges_out = at::empty({0}, self.options()); + return histogram_out( + self, bin_ct, range, weight, density, hist, bin_edges_out); +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h new file mode 100644 index 000000000..da153186a --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges); + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp new file mode 100644 index 000000000..be888e4b4 --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -0,0 +1,227 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::native::xpu { + +template +struct HistogramddKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_ * bin_size_) { + int64_t ele_idx = wi_id / bin_size_; + int64_t bin_idx = wi_id % bin_size_; + + // [left, right) + if (input_[ele_idx] >= bin_edges_[bin_idx] && + input_[ele_idx] < bin_edges_[bin_idx + 1]) { + scalar_t value = weight_ ? weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_idx), value); + return; + } + + // For last bin, [left, right] + if (bin_idx == 0 && input_[ele_idx] == bin_edges_[bin_size_]) { + scalar_t value = weight_ ? 
weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_size_ - 1), value); + } + } + } + + HistogramddKernelFunctor( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) + : input_(input), + bin_edges_(bin_edges), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size) {} + + private: + const scalar_t* input_; + const scalar_t* bin_edges_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; +}; + +// For one dimension case +template +void histogramdd_template( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) { + HistogramddKernelFunctor kfn( + input, bin_edges, hist, weight, input_size, bin_size); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = + (input_size * bin_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +template +struct HistogramddLinearKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_) { + scalar_t i_value = input_[wi_id]; + if (i_value >= leftmost_edge_ && i_value <= rightmost_edge_) { + int64_t bin = + (int64_t)(((i_value - leftmost_edge_)) * bin_size_ / (rightmost_edge_ - leftmost_edge_)); + if (bin == bin_size_) + bin -= 1; + scalar_t value = weight_ ? weight_[wi_id] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin), value); + } + } + } + + HistogramddLinearKernelFunctor( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) + : input_(input), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size), + leftmost_edge_(leftmost_edge), + rightmost_edge_(rightmost_edge) {} + + private: + const scalar_t* input_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; + double leftmost_edge_; + double rightmost_edge_; +}; + +// For one dimension case +template +void histogramdd_linear_template( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) { + HistogramddLinearKernelFunctor kfn( + input, hist, weight, input_size, bin_size, leftmost_edge, rightmost_edge); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (input_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges_) { + globalContext().alertNotDeterministic("histogramdd_kernel_xpu"); + hist.fill_(0); + Tensor bin_edges = bin_edges_.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_xpu", [&]() { + histogramdd_template( + self.data_ptr(), + bin_edges.data_ptr(), + hist.data_ptr(), + weight.has_value() ? 
weight->data_ptr() : nullptr, + self.numel(), + bin_edges.numel() - 1); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges) { + globalContext().alertNotDeterministic("histogramdd_linear_kernel_xpu"); + hist.fill_(0); + + // default range for empty input + double leftmost_edge = 0., rightmost_edge = 1.; + if (range.has_value()) { + leftmost_edge = range.value()[0]; + rightmost_edge = range.value()[1]; + } else if (self.numel() > 0) { + auto extrema = at::aminmax(self); + leftmost_edge = std::get<0>(extrema).item(); + rightmost_edge = std::get<1>(extrema).item(); + } + + if (leftmost_edge == rightmost_edge) { + leftmost_edge -= 0.5; + rightmost_edge += 0.5; + } + + at::linspace_out(out_bin_edges, leftmost_edge, rightmost_edge, bin_ct + 1); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_linear_xpu", [&]() { + histogramdd_linear_template( + self.data_ptr(), + hist.data_ptr(), + weight.has_value() ? weight->data_ptr() : nullptr, + self.numel(), + bin_ct, + leftmost_edge, + rightmost_edge); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = out_bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 76790d817..a05d47e05 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -163,6 +163,11 @@ "test_compare_cpu_std_mean_xpu_bfloat16", "test_compare_cpu_sub_xpu_float16", "test_compare_cpu_var_mean_xpu_bfloat16", + + # test case doesn't make sense, will file an issue to track it. 
+ # https://github.com/pytorch/pytorch/issues/130916 + "test_compare_cpu_histogram_xpu_float32", + "test_compare_cpu_histogram_xpu_float64", ) diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py index 792c7c569..1d9d3df6d 100644 --- a/test/xpu/extended/test_ops_xpu.py +++ b/test/xpu/extended/test_ops_xpu.py @@ -81,6 +81,12 @@ def test_compare_cpu(self, device, dtype, op): self.proxy = Namespace.TestCommonProxy() test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu) test_common_test_fn(self.proxy, device, dtype, op) + # for CUDA doesn't support operators + elif (op.name in ["histogram",]): + if dtype in op.dtypes: + self.proxy = Namespace.TestCommonProxy() + test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu) + test_common_test_fn(self.proxy, device, dtype, op) else: pytest.skip(f"{op.name} has not supported {dtype} yet both for cpu and xpu") diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 3ffc3feb6..313a2e8b3 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -783,6 +783,10 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf' "test_dtypes_cosh_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support + # but test_dtypes infrastructure leverage CUDA supported datatypes + "test_dtypes_histogram_xpu", + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.float16}. # Align with CPU implementation since, # 1. most cases of nextafter require Half dtype. diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 96e43058a..6d989fa14 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "histogram", "repeat_interleave", "fmax", "fmin", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 48ddc4c3f..224c0e7d8 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,10 @@ supported: - ceil - ceil_ - ceil.out + - histogram.bins_tensor + - histogram.bins_tensor_out + - histogram.bin_ct + - histogram.bin_ct_out - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out From bbd200abe2d3a3e290c45e04127723f65293ed7e Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 25 Jul 2024 18:27:53 +0800 Subject: [PATCH 07/17] Timm_regnet regression which tracked in issue 603 (#650) Skip check and tracked in https://github.com/intel/torch-xpu-ops/issues/603 --- .github/ci_expected_accuracy/inductor_torchbench_training.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv index 2984ecbc9..dc766eac0 100644 --- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv +++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv @@ -93,7 +93,7 @@ tacotron2,fail_to_run,fail_to_run,fail_to_run,fail_to_run,fail_to_run timm_efficientdet,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load timm_efficientnet,pass,pass,pass,pass,pass timm_nfnet,pass,pass,pass,pass,pass -timm_regnet,pass,pass,pass,pass,pass +timm_regnet,pass,fail_accuracy,pass,pass,pass timm_resnest,pass,pass,pass,pass,pass timm_vision_transformer,pass,pass,pass,pass,pass 
timm_vision_transformer_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip From c2405f25a2a886b91f5a84432721511db977a5dd Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Sat, 27 Jul 2024 16:44:35 +0800 Subject: [PATCH 08/17] Add aten::nansum and aten::nansum.out (#567) Add aten::nansum and aten::nansum.out. --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 34 ++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/ReduceOpsKernels.h | 2 + .../native/xpu/sycl/ReduceSumProdKernels.cpp | 82 ++++++++++++++++--- test/xpu/extended/run_test_with_skip.py | 2 + test/xpu/xpu_test_utils.py | 2 + yaml/xpu_functions.yaml | 2 + 7 files changed, 111 insertions(+), 14 deletions(-) diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 826b1e27b..33fb03823 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -869,6 +869,40 @@ Tensor XPUNativeFunctions::amin( return out; } +Tensor& XPUNativeFunctions::nansum_out( + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + // For integral types, use existing sum as + // integral types don't have `Nan`. + if (c10::isIntegralType(self.scalar_type(), true)) { + return at::sum_out(result, self, dim, keepdim, opt_dtype); + } + + auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result); + result = resize_reduction(result, self, dim, keepdim, out_dtype); + auto iter = meta::make_reduction_from_out_ty( + self, result, dim, keepdim, result.scalar_type()); + + if (iter.numel() == 0) { + result = result.zero_(); + } else { + native::xpu::nansum_kernel(iter); + } + return result; +} + +Tensor XPUNativeFunctions::nansum( + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + std::optional opt_dtype) { + Tensor result; + return XPUNativeFunctions::nansum_out(self, dim, keepdim, opt_dtype, result); +} + static ScalarType get_result_or_self_value_dtype( const Tensor& self, const Tensor& result, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 356a45f70..b8969548c 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -238,7 +238,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "multi_margin_loss_backward", "nanmedian", "nanmedian.dim_values", - "nansum", "ormqr", "_pdist_backward", "_pdist_forward", diff --git a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h index 9ee0ee955..4a54766ae 100644 --- a/src/ATen/native/xpu/sycl/ReduceOpsKernels.h +++ b/src/ATen/native/xpu/sycl/ReduceOpsKernels.h @@ -16,6 +16,8 @@ void mean_kernel(TensorIterator& iter); void sum_kernel(TensorIterator& iter); +void nansum_kernel(TensorIterator& iter); + void std_var_kernel(TensorIterator& iter, double correction, bool take_sqrt); void aminmax_kernel(TensorIterator& iter); diff --git a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp index 728a75582..d68ca376b 100644 --- a/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceSumProdKernels.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,6 +9,36 @@ namespace at { namespace native { namespace xpu { +// The function `reduce_dispatch` below dispatches to the kernel based +// on the type of `iter`. 
It takes care of the common logic +// for handling Half-Precision floating types. +// Otherwise the functor `op` is called to dispatch to the kernel +// of relevant type. +// +// Note: Functor `op` should take care of all the types to be supported +// except for `at::Half` and `at::BFloat16`. +template < + template < + typename scalar_t, + typename acc_t = scalar_t, + typename out_t = scalar_t> + typename OpFunctor, + typename GeneralDispatcher> +static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { + if (iter.dtype() == kHalf) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } else if (iter.dtype() == kBFloat16) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } + op(iter); +} + template struct SumFunctor { inline acc_t operator()(acc_t a, acc_t b) const { @@ -36,22 +67,47 @@ struct sum_functor { }; void sum_kernel(TensorIterator& iter) { - if (iter.dtype() == kHalf) { - return sum_functor{}(iter); - } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { - // type promotion that does cast and reduction in a single kernel - return sum_functor{}(iter); - } else if (iter.dtype() == kBFloat16) { - return sum_functor{}(iter); - } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { - // type promotion that does cast and reduction in a single kernel - return sum_functor{}(iter); + auto general_dispatcher = [](TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kBool, kComplexHalf, iter.dtype(), "sum_xpu", [&]() { + sum_functor{}(iter); + }); + }; + reduce_dispatch(iter, general_dispatcher); +} + +template < + typename scalar_t, + typename acc_t = scalar_t, + typename out_t = scalar_t> +struct NansumFunctor { + void operator()(TensorIterator& iter) { + gpu_reduce_kernel(iter, NanSumOps{}); } +}; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - kBool, kComplexHalf, iter.dtype(), "sum_xpu", [&]() { - sum_functor{}(iter); +template +struct NansumComplexFunctor { + void operator()(TensorIterator& iter) { + using acc_t = at::opmath_type; + gpu_reduce_kernel(iter, NanSumOps{}); + } +}; + +void nansum_kernel(TensorIterator& iter) { + auto general_dispatcher = [](TensorIterator& iter) { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "nansum_xpu", [&]() { + NansumComplexFunctor{}(iter); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_xpu", [&]() { + NansumFunctor{}(iter); }); + } + }; + reduce_dispatch(iter, general_dispatcher); } } // namespace xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index a05d47e05..37f4ab220 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -127,6 +127,8 @@ "test_compare_cpu__batch_norm_with_update_xpu_bfloat16", "test_compare_cpu__batch_norm_with_update_xpu_float16", "test_compare_cpu_nn_functional_huber_loss_xpu_bfloat16", + "test_compare_cpu_nansum_xpu_bfloat16", + "test_compare_cpu_nanmean_xpu_bfloat16", # Align with CUDA impl by using accumulate type. But CPU doesn't use. # When XPU uses original data type, the case passes. 
"test_compare_cpu_logit_xpu_bfloat16", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 6d989fa14..e8c0dc55b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -136,6 +136,7 @@ "atanh", "sqrt", "sum", + "nansum", "amin", "amax", "std", @@ -218,6 +219,7 @@ "copysign", "count_nonzero", "nan_to_num", + "nanmean", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 224c0e7d8..92ab06e74 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -339,6 +339,8 @@ supported: - min.dim_min - sum.dim_IntList - sum.IntList_out + - nansum + - nansum.out - mean.out - mean.dim - std.correction From c1f99eaac5fda36d230d58a92489713706c0d2f1 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Sat, 27 Jul 2024 16:46:03 +0800 Subject: [PATCH 09/17] Add aten::masked_select and its variants (#649) Task list: - [x] masked_select - [x] masked_select.out Fixes: #629 --- src/ATen/native/xpu/Indexing.cpp | 54 ++++++++++++++++++++++- src/ATen/native/xpu/TensorTopK.cpp | 5 ++- src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 6 +-- test/xpu/run_test_with_skip.py | 1 - test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 2 + 7 files changed, 59 insertions(+), 11 deletions(-) diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index 42321e7b7..5db9a7238 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -1,10 +1,11 @@ #include +#include #include +#include #include #include -#include - #include +#include #include namespace at { @@ -44,4 +45,53 @@ Tensor XPUNativeFunctions::index_select( return index_select_out(self, dim, index, out); } +static Tensor& masked_select_out_impl( + Tensor& result, + const Tensor& self, + const Tensor& mask) { + NoNamesGuard guard; + + TORCH_CHECK( + mask.scalar_type() == ScalarType::Bool, + "masked_select: expected BoolTensor for mask"); + TORCH_CHECK( + self.scalar_type() == result.scalar_type(), + "masked_select(): self and result must have the same scalar type"); + + auto mask_temp = (mask.dim() == 0) + ? c10::MaybeOwned::owned(mask.unsqueeze(0)) + : c10::MaybeOwned::borrowed(mask); + auto self_temp = (self.dim() == 0) + ? c10::MaybeOwned::owned(self.unsqueeze(0)) + : c10::MaybeOwned::borrowed(self); + + // Cannot reassign to mask_temp and self_temp here! if they are + // owning and expand_outplace returns a borrow, the returned borrow + // would dangle. + auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp); + XPUNativeFunctions::index_out( + *std::get<1>(mask_self_expanded), + c10::List>( + {*std::move(std::get<0>(mask_self_expanded))}), + result); + + return result; +} + +Tensor XPUNativeFunctions::masked_select( + const Tensor& self, + const Tensor& mask) { + namedinference::compute_broadcast_outnames(self, mask); + Tensor result = at::empty({0}, self.options()); + return masked_select_out_impl(result, self, mask); +} + +Tensor& XPUNativeFunctions::masked_select_out( + const Tensor& self, + const Tensor& mask, + Tensor& result) { + namedinference::compute_broadcast_outnames(self, mask); + return masked_select_out_impl(result, self, mask); +} + } // namespace at diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp index a9c5836a7..3961160bf 100644 --- a/src/ATen/native/xpu/TensorTopK.cpp +++ b/src/ATen/native/xpu/TensorTopK.cpp @@ -55,8 +55,9 @@ void topk_out_impl( TORCH_CHECK( k >= 0 && k <= (self.dim() > 0 ? 
self.size(dim) : 1), "selected index k out of range"); - - // If k is 0 the result is an empty tensor, so we don't need to launch a kernel. + + // If k is 0 the result is an empty tensor, so we don't need to launch a + // kernel. if (k == 0) { return; } diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index b8969548c..7bcc67ca5 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -225,7 +225,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "logspace.out", "lu_unpack.out", "masked_scatter_", - "masked_select", "max_pool3d_with_indices", "max_pool3d_with_indices_backward", "max_unpool2d", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp index c3e27ab62..0b10f982a 100644 --- a/src/ATen/native/xpu/sycl/RepeatKernel.cpp +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -51,11 +51,7 @@ static void compute_xpu( return; auto kfn = RepeatInterleaveKernelFunctor( - repeat_ptr, - cumsum_ptr, - result_ptr, - size, - result_size); + repeat_ptr, cumsum_ptr, result_ptr, size, result_size); int64_t wg_size = syclMaxWorkGroupSize(kfn); int64_t local_range = size < wg_size ? size : wg_size; diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 313a2e8b3..a1995c5af 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -53,7 +53,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_compare_cpu_to_sparse_xpu_float32", "test_errors_dot_xpu", "test_errors_kthvalue_xpu", - "test_errors_masked_select_xpu", "test_errors_sparse_mul_layout0_xpu", "test_errors_sparse_mul_layout1_xpu", "test_errors_sparse_mul_layout2_xpu", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index e8c0dc55b..9b3ee286b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -80,6 +80,7 @@ "index_fill", "index_put", "index_select", + "masked_select", "isin", "isnan", "le", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 92ab06e74..75a40f54f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -112,6 +112,8 @@ supported: - index_fill_.int_Tensor - index_select - index_select.out + - masked_select + - masked_select.out - gcd - gcd.out - gcd_ From eb223b2e29bb3e35fef34fda7775c31f7cc5101f Mon Sep 17 00:00:00 2001 From: hjhee Date: Sat, 27 Jul 2024 16:46:57 +0800 Subject: [PATCH 10/17] erfinv: Sharing erfinv scalar functor to align precision with other PyTorch backends. 
(#634) Use existing `calc_erfinv` implementation from `` reduces the discrepancy between CPU MKL and XPU implementations of the erfinv for specific test cases in float64 --- .../xpu/sycl/UnarySpecialOpsKernels.cpp | 65 ++----------------- test/xpu/run_test_with_skip.py | 20 ------ 2 files changed, 7 insertions(+), 78 deletions(-) diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index b7d0b8974..e2303634e 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -79,67 +79,16 @@ void erfc_kernel(TensorIteratorBase& iter) { template struct ErfinvFunctor { - using opmath_type = at::opmath_type; - scalar_t operator()(scalar_t in) const { - scalar_t out; - opmath_type z, num, dem; - - auto x = static_cast(in); - if (std::fabs(x) > 1.0f) { - out = static_cast(NAN); - return out; - } - if (std::fabs(x) == 1.0f) { - out = static_cast( - (std::copysign(1.0, static_cast(x))) * - (std::numeric_limits::infinity())); - return out; - } - if (std::fabs(x) <= 0.7f) { - z = x * x; - num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]); - dem = - ((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z + - static_cast(1.0)); - out = x * num / dem; - } else { - z = static_cast( - std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0))); - num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0]; - dem = (d_[1] * z + d_[0]) * z + static_cast(1.0); - out = static_cast( - static_cast(std::copysign(1.0, static_cast(x))) * - num / dem); - } - out = out - - static_cast( - (std::erf(static_cast(out)) - x) / - ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); - out = out - - static_cast( - (std::erf(static_cast(out)) - x) / - ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); - return out; + return calc_erfinv(in); } +}; - static constexpr double PI_f64_ = 3.14159265358979323846; - static constexpr std::array a_ = { - 0.886226899, - -1.645349621, - 0.914624893, - -0.140543331}; - static constexpr std::array b_ = { - -2.118377725, - 1.442710462, - -0.329097515, - 0.012229801}; - static constexpr std::array c_ = { - -1.970840454, - -1.624906493, - 3.429567803, - 1.641345311}; - static constexpr std::array d_ = {3.543889200, 1.637067800}; +template <> +struct ErfinvFunctor { + c10::Half operator()(c10::Half in) const { + return calc_erfinv(float(in)); + } }; void erfinv_kernel(TensorIteratorBase& iter) { diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index a1995c5af..719af3ca4 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1551,26 +1551,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): # Relative difference: 6.156719153309558e-06 (up to 1e-06 allowed) "test_log1p_complex_xpu_complex64", - # CPU MKL::erfinv vs XPU impl. 
At most 6.e-06 - # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) - "test_reference_numerics_large__refs_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) - "test_reference_numerics_large_erfinv_xpu_float64", - # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) - # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) - "test_reference_numerics_normal__refs_erfinv_xpu_float64", - # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) - # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) - "test_reference_numerics_normal_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) - "test_reference_numerics_small__refs_erfinv_xpu_float64", - # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) - # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) - "test_reference_numerics_small_erfinv_xpu_float64", - # Issue: https://github.com/intel/torch-xpu-ops/issues/622 # Mismatched elements: 8 / 943593 (0.0%) # Greatest absolute difference: inf at index (9, 860) (up to 0.001 allowed) From 0f6f4d394861c0f3d28e6b8a1e3ef158dc057d2c Mon Sep 17 00:00:00 2001 From: majing Date: Mon, 29 Jul 2024 10:40:47 +0800 Subject: [PATCH 11/17] Add aten::replication_pad1d/2d/3d and backward (#600) Signed-off-by: Ma, Jing1 Signed-off-by: majing Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReplicationPadding.cpp | 316 ++++++++ src/ATen/native/xpu/XPUFallback.template | 6 - .../xpu/sycl/ReplicationPaddingKernels.cpp | 743 ++++++++++++++++++ .../xpu/sycl/ReplicationPaddingKernels.h | 40 + test/xpu/extended/run_test_with_skip.py | 6 + test/xpu/xpu_test_utils.py | 2 +- yaml/xpu_functions.yaml | 12 + 7 files changed, 1118 insertions(+), 7 deletions(-) create mode 100644 src/ATen/native/xpu/ReplicationPadding.cpp create mode 100644 src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp new file mode 100644 index 000000000..b4f6d3272 --- /dev/null +++ b/src/ATen/native/xpu/ReplicationPadding.cpp @@ -0,0 +1,316 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +void replication_pad1d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + + int64_t dimw = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + + at::native::padding::check_valid_input<1>(input, paddingSize); + + if (input.ndimension() == 3) { + nbatch = input.size(0); + dimw++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t iwidth = input.size(dimw); + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth >= 1, + "input (W: ", iwidth, ") is too small." 
+ " Calculated output W: ", owidth); + + if (output.defined()) { + if (input.ndimension() == 2) { + xpu::resize_out(output, {nslices, owidth}, {}, input.options()); + } else { + xpu::resize_out(output, {nbatch, nslices, owidth}, {}, input.options()); + } + } else { + if (input.ndimension() == 2) { + output = xpu::create_out({nslices, owidth}, {}, input.options()); + } else { + output = xpu::create_out({nbatch, nslices, owidth}, {}, input.options()); + } + } +} + +void replication_pad1d_backward_meta( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef paddingSize) { + int64_t dimw = 1; + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + + if (input.ndimension() == 3) { + dimw++; + } + + /* sizes */ + int64_t iwidth = input.size(dimw); + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", owidth, + " Got: ", grad_output.size(dimw)); + + if (grad_input.defined()) { + xpu::resize_out(grad_input, input.sizes(), {}, input.options()); + } else { + grad_input = xpu::create_out(input.sizes(), {}, input.options()); + } +} + +void replication_pad2d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); + int64_t pad_l = paddingSize[0]; + int64_t pad_r = paddingSize[1]; + int64_t pad_t = paddingSize[2]; + int64_t pad_b = paddingSize[3]; + int64_t dimw = 2; + int64_t dimh = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + at::native::padding::check_valid_input<2>(input, paddingSize); + + if (input.dim() == 4) { + nbatch = input.size(0); + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t oheight = iheight + pad_t + pad_b; + int64_t owidth = iwidth + pad_l + pad_r; + + TORCH_CHECK(owidth >= 1 || oheight >= 1, + "input (H: ", iheight, ", W: ", iwidth, " ) is too small." 
+ " Calculated output H: ", oheight, " W: ", owidth); + + if (output.defined()) { + if (input.dim() == 3) { + xpu::resize_out( + output, + {nslices, oheight, owidth}, {}, input.options()); + } else { + xpu::resize_out( + output, {nbatch, nslices, oheight, owidth}, {}, input.options()); + } + } else { + if (input.dim() == 3) { + output = xpu::create_out( + {nslices, oheight, owidth}, {}, input.options()); + } else { + output = xpu::create_out( + {nbatch, nslices, oheight, owidth}, {}, input.options()); + } + } +} + +void replication_pad3d_meta( + Tensor& output, + const Tensor& input, + IntArrayRef paddingSize) { + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); + int64_t pleft = paddingSize[0]; + int64_t pright = paddingSize[1]; + int64_t ptop = paddingSize[2]; + int64_t pbottom = paddingSize[3]; + int64_t pfront = paddingSize[4]; + int64_t pback = paddingSize[5]; + int64_t dimw = 3; + int64_t dimh = 2; + int64_t dimd = 1; + int64_t dimslices = 0; + int64_t nbatch = 1; + + at::native::padding::check_valid_input<3>(input, paddingSize); + + if (input.dim() == 5) { + nbatch = input.size(0); + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + int64_t nslices = input.size(dimslices); + int64_t idepth = input.size(dimd); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t odepth = idepth + pfront + pback; + int64_t oheight = iheight + ptop + pbottom; + int64_t owidth = iwidth + pleft + pright; + + TORCH_CHECK(owidth >= 1 || oheight >= 1 || odepth >= 1, + "input (D: ", idepth, " H: ", iheight, ", W: ", iwidth, + ") is too small." + " Calculated output D: ", odepth, " H: ", oheight, " W: ", owidth); + + if (output.defined()) { + if (input.dim() == 4) { + xpu::resize_out( + output, + {nslices, odepth, oheight, owidth}, {}, input.options()); + } else { + xpu::resize_out( + output, {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); + } + } else { + if (input.dim() == 4) { + output = xpu::create_out( + {nslices, odepth, oheight, owidth}, {}, input.options()); + } else { + output = xpu::create_out( + {nbatch, nslices, odepth, oheight, owidth}, {}, input.options()); + } + } +} + +Tensor XPUNativeFunctions::replication_pad1d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad1d_meta(output, input, padding); + native::xpu::replication_pad1d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad1d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad1d_meta(output, input, padding); + native::xpu::replication_pad1d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad1d_backward( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + Tensor grad_input; + replication_pad1d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::replication_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad1d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + replication_pad1d_backward_meta(grad_input, grad_output, input, padding); + native::xpu::replication_pad1d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad2d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad2d_meta(output, 
input, padding); + native::xpu::replication_pad2d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad2d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad2d_meta(output, input, padding); + native::xpu::replication_pad2d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad2d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + native::xpu::replication_pad2d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor XPUNativeFunctions::replication_pad2d_backward( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + native::xpu::replication_pad2d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor XPUNativeFunctions::replication_pad3d( + const Tensor& input, + IntArrayRef padding) { + Tensor output; + replication_pad3d_meta(output, input, padding); + native::xpu::replication_pad3d_kernel(output, input, padding); + return output; +} + +Tensor& XPUNativeFunctions::replication_pad3d_out( + const Tensor& input, + IntArrayRef padding, + Tensor& output) { + replication_pad3d_meta(output, input, padding); + native::xpu::replication_pad3d_kernel(output, input, padding); + return output; +} + +Tensor XPUNativeFunctions::replication_pad3d_backward( + const Tensor& grad_output, + const Tensor& input, + at::IntArrayRef padding) { + auto grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + native::xpu::replication_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +Tensor& XPUNativeFunctions::replication_pad3d_backward_out( + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + Tensor& grad_input) { + native::xpu::replication_pad3d_backward_kernel( + grad_input, grad_output, input, padding); + return grad_input; +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 7bcc67ca5..3f2064653 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -246,10 +246,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "replication_pad1d_backward.grad_input", - "replication_pad1d.out", - "replication_pad2d_backward", - "replication_pad2d.out", "round.decimals_out", "round.out", "rrelu_with_noise", @@ -318,8 +314,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "vdot", "xlogy.OutTensor", "_upsample_bicubic2d_aa.out", - "replication_pad3d.out", - "replication_pad3d_backward", }; for (auto& op_name : fallback_list) { m.impl( diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp new file mode 100644 index 000000000..bb42aa327 --- /dev/null +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.cpp @@ -0,0 +1,743 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +inline int imin(int a, int b) { + return a > b ? b : a; +} +inline int imax(int a, int b) { + return a > b ? 
a : b; +} + +template +struct ParallelReplicationPad1dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_id = item.get_global_id(2); + if (output_id < output_plane_size_) { + int64_t output_x = output_id % output_.size(2); + int64_t i_start_x = imax(0, -pad_left_); + int64_t o_start_x = imax(0, pad_left_); + int64_t input_x = + imin(imax(pad_left_, output_x), input_.size(2) + pad_left_ - 1) - + o_start_x + i_start_x; + + f_(input_, output_, item.get_group(1), item.get_group(0), output_x, input_x); + } + } + ParallelReplicationPad1dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right, + const F f, + int64_t output_plane_size) + : input_(input), + output_(output), + pad_left_(pad_left), + pad_right_(pad_right), + f_(f), + output_plane_size_(output_plane_size) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t pad_left_; + int64_t pad_right_; + const F f_; + int64_t output_plane_size_; +}; + +template +void parallel_replication_pad1d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2); + + ParallelReplicationPad1dKernelFunctor kfn( + input, output, pad_left, pad_right, f, output_plane_size); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct ReplicationPad1dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t plane, + int64_t batch, + int64_t output_x, + int64_t intput_x) const { + auto value_to_copy = input[batch][plane][intput_x]; + output[batch][plane][output_x] = value_to_copy; + } +}; + +template +void replication_pad1d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_right) { + ReplicationPad1dForwardFunctor f; + parallel_replication_pad1d(input, output, pad_left, pad_right, f); +} + +template +struct ReplicationPad1dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t plane, + int64_t batch, + int64_t output_x, + int64_t intput_x) const { + auto value_to_add = grad_output[batch][plane][output_x]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][intput_x]; + atomicAdd(target, value_to_add); + } +}; + +template +void replication_pad1d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t pad_left, + int64_t pad_right) { + ReplicationPad1dBackwardFunctor f; + parallel_replication_pad1d(grad_input, grad_output, pad_left, pad_right, f); +} + +template +struct ParallelReplicationPad2dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + const int output_id = item.get_global_id(2); + const int batch = item.get_global_id(0); + const int plane = item.get_global_id(1); + + if (output_id < output_.size(2) * output_.size(3)) { + const int output_x = output_id / output_.size(3); // height + const int output_y = output_id % output_.size(3); // width + + const int iStartX = imax(0, -padT_); + const int iStartY = imax(0, -padL_); + 
const int oStartX = imax(0, padT_); + const int oStartY = imax(0, padL_); + + const int input_x = imin(imax(padT_, output_x), input_.size(2) + padT_ - 1) - oStartX + iStartX; + const int input_y = imin(imax(padL_, output_y), input_.size(3) + padL_ - 1) - oStartY + iStartY; + + f_(input_, output_, batch, plane, input_x, input_y, output_x, output_y); + } +} + ParallelReplicationPad2dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t padT, + int64_t padL, + const F f) + : input_(input), + output_(output), + padT_(padT), + padL_(padL), + f_(f) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t padT_; + int64_t padL_; + const F f_; +}; + +template +void parallel_replication_pad2d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + const int padT, + const int padL, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2) * output.size(3); + + ParallelReplicationPad2dKernelFunctor kfn( + input, output, padT, padL, f); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), + queue, + kfn); +} + +template +struct ReplicationPad2dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t batch, + int64_t plane, + int64_t input_x, + int64_t input_y, + int64_t output_x, + int64_t output_y) const { + scalar_t valueToCopy = input[batch][plane][input_x][input_y]; + output[batch][plane][output_x][output_y] = valueToCopy; + } +}; + +template +void replication_pad2d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t padT, + int64_t padL) { + ReplicationPad2dForwardFunctor f; + parallel_replication_pad2d(input, output, padT, padL, f); +} + +template +struct ReplicationPad2dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t batch, + int64_t plane, + int64_t input_x, + int64_t input_y, + int64_t output_x, + int64_t output_y) const { + scalar_t valueToAdd = grad_output[batch][plane][output_x][output_y]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][input_x][input_y]; + atomicAdd(target, valueToAdd); + } +}; + +template +void replication_pad2d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + const int padT, + const int padL) { + ReplicationPad2dBackwardFunctor f; + parallel_replication_pad2d(grad_input, grad_output, padT, padL, f); +} + +template +struct ParallelReplicationPad3dKernelFunctor { + void operator()(sycl::nd_item<3> item) const { + auto output_id = item.get_global_id(2); + if (output_id < output_plane_size_) { + int64_t output_x = output_id % output_.size(4); + int64_t output_y = (output_id / output_.size(4)) % output_.size(3); + int64_t output_z = output_id / (output_.size(3) * output_.size(4)); + + int64_t i_start_x = imax(0, -pad_left_); + int64_t i_start_y = imax(0, -pad_top_); + int64_t i_start_z = imax(0, -pad_front_); + int64_t o_start_x = imax(0, pad_left_); + int64_t o_start_y = imax(0, pad_top_); + int64_t o_start_z = imax(0, pad_front_); + + int64_t input_x = + imin(imax(pad_left_, output_x), input_.size(4) + pad_left_ - 1) - + o_start_x + i_start_x; + 
int64_t input_y = + imin(imax(pad_top_, output_y), input_.size(3) + pad_top_ - 1) - o_start_y + + i_start_y; + int64_t input_z = + imin(imax(pad_front_, output_z), input_.size(2) + pad_front_ - 1) - + o_start_z + i_start_z; + + f_(input_, + output_, + item.get_group(1), + item.get_group(0), + output_z, + output_y, + output_x, + input_z, + input_y, + input_x); + } + } + ParallelReplicationPad3dKernelFunctor( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F f, + int64_t output_plane_size) + : input_(input), + output_(output), + pad_left_(pad_left), + pad_top_(pad_top), + pad_front_(pad_front), + f_(f), + output_plane_size_(output_plane_size) {} + + private: + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; + int64_t pad_left_; + int64_t pad_top_; + int64_t pad_front_; + const F f_; + int64_t output_plane_size_; +}; + +template +void parallel_replication_pad3d( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front, + const F& f) { + auto queue = getCurrentSYCLQueue(); + int64_t output_plane_size = output.size(2) * output.size(3) * output.size(4); + + ParallelReplicationPad3dKernelFunctor kfn( + input, output, pad_left, pad_top, pad_front, f, output_plane_size); + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t work_group_num = at::ceil_div(output_plane_size, work_group_size); + int64_t nplane = output.size(1); + int64_t nbatch = output.size(0); + + sycl_kernel_submit( + sycl::range<3>(nbatch, nplane, work_group_size * work_group_num), + sycl::range<3>(1, 1, work_group_size), queue, + kfn); +} + +template +struct ReplicationPad3dForwardFunctor { + void operator()( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t intput_z, + int64_t intput_y, + int64_t intput_x) const { + auto value_to_copy = input[batch][plane][intput_z][intput_y][intput_x]; + output[batch][plane][output_z][output_y][output_x] = value_to_copy; + } +}; + +template +void replication_pad3d_forward_template( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + ReplicationPad3dForwardFunctor f; + parallel_replication_pad3d(input, output, pad_left, pad_top, pad_front, f); +} + +template +struct ReplicationPad3dBackwardFunctor { + void operator()( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t plane, + int64_t batch, + int64_t output_z, + int64_t output_y, + int64_t output_x, + int64_t intput_z, + int64_t intput_y, + int64_t intput_x) const { + auto value_to_add = grad_output[batch][plane][output_z][output_y][output_x]; + auto target = + (sycl_global_ptr)&grad_input[batch][plane][intput_z][intput_y][intput_x]; + atomicAdd(target, value_to_add); + } +}; + +template +void replication_pad3d_backward_template( + PackedTensorAccessor64 grad_input, + PackedTensorAccessor64 grad_output, + int64_t pad_left, + int64_t pad_top, + int64_t pad_front) { + ReplicationPad3dBackwardFunctor f; + parallel_replication_pad3d( + grad_input, grad_output, pad_left, pad_top, pad_front, f); +} + +void replication_pad1d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + + if 
(input.numel() == 0) { + return; + } + + int64_t pad_left = padding[0]; + int64_t pad_right = padding[1]; + int64_t num_input_dims = input.dim(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "replication_pad1d_xpu", [&] { + auto input_ = input; + auto output_ = output; + if (num_input_dims == 2) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + + auto input_packed = input_.packed_accessor64(); + auto output_packed = output_.packed_accessor64(); + + replication_pad1d_forward_template( + input_packed, output_packed, pad_left, pad_right); + }); +} + +void replication_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad1d_backward_xpu"); + + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + TORCH_CHECK(grad_output.numel() < std::numeric_limits::max(), + "replication_pad1d only supports output tensors with less than 2^63 - 1 elements"); + + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + + int pad_left = padding[0]; + int pad_right = padding[1]; + int num_input_dims = input.ndimension(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + input.scalar_type(), + "replication_pad1d_backward_xpu", + [&] { + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (num_input_dims == 2) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + + replication_pad1d_backward_template( + grad_input_packed, grad_output_packed, pad_left, pad_right); + }); +} + +void replication_pad2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + if (input.numel() == 0) { + return; + } + const auto padL = padding[0]; + const auto padT = padding[2]; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, + input.scalar_type(), "replication_pad2d_xpu", [&] { + Tensor input_ = input; + Tensor output_ = output; + if (input.dim() == 3) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + auto devInput = input_.packed_accessor64(); + auto devOutput = output_.packed_accessor64(); + replication_pad2d_forward_template(devInput, devOutput, padT, padL); + } + ); +} + +void replication_pad2d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad2d_backward_xpu"); + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + TORCH_CHECK(canUse32BitIndexMath(grad_output), + "output gradient tensor must fit into 32-bit index math"); + TORCH_CHECK(padding.size() == 4, "padding Size is expected to be 4"); + + const auto padL = padding[0]; + const auto padR = padding[1]; + const auto padT = padding[2]; + const auto padB = padding[3]; + int dimh = 1; + int dimw = 2; + + int numInputDims = input.dim(); + if (numInputDims == 4) { + dimh++; + dimw++; + } + const auto 
iheight = input.size(dimh); + const auto iwidth = input.size(dimw); + const auto oheight = iheight + padT + padB; + const auto owidth = iwidth + padL + padR; + + TORCH_CHECK(owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", owidth, ", Got: ", + grad_output.size(dimw)); + TORCH_CHECK(oheight == grad_output.size(dimh), + "grad_output height unexpected. Expected: ", oheight, ", Got: ", + grad_output.size(dimh)); + + grad_input.resize_as_(input); + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, + input.scalar_type(), "replication_pad2d_backward_xpu", [&] { + + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (numInputDims == 3) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + + replication_pad2d_backward_template(grad_input_packed, grad_output_packed, padT, padL); + } + ); +} + +void replication_pad3d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding) { + if (input.numel() == 0) { + return; + } + int64_t pad_left = padding[0]; + int64_t pad_top = padding[2]; + int64_t pad_front = padding[4]; + + int64_t num_input_dims = input.dim(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input.scalar_type(), "replication_pad3d_xpu", [&] { + auto input_ = input; + auto output_ = output; + if (num_input_dims == 4) { + input_ = input.unsqueeze(0); + output_ = output.unsqueeze(0); + } + + auto input_packed = input_.packed_accessor64(); + auto output_packed = output_.packed_accessor64(); + + replication_pad3d_forward_template( + input_packed, output_packed, pad_left, pad_top, pad_front); + }); +} + +static inline void shapeAndGradOutputCheck3d( + const Tensor& input, + const Tensor& grad_output, + int64_t pad_left, + int64_t pad_right, + int64_t pad_top, + int64_t pad_bottom, + int64_t pad_front, + int64_t pad_back) { + TORCH_CHECK(canUse32BitIndexMath(input), + "input tensor must fit into 32-bit index math"); + int64_t num_input_dims = input.dim(); + + bool valid_dims = + input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (num_input_dims == 4 && valid_dims) || + (num_input_dims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + int plane_dim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + if (num_input_dims == 5) { + plane_dim++; + dimd++; + dimh++; + dimw++; + } + + int64_t num_planes = input.size(plane_dim); + int64_t idepth = input.size(dimd); + int64_t iheight = input.size(dimh); + int64_t iwidth = input.size(dimw); + int64_t odepth = idepth + pad_front + pad_back; + int64_t oheight = iheight + pad_top + pad_bottom; + int64_t owidth = iwidth + pad_left + pad_right; + TORCH_CHECK( + owidth >= 1 || oheight >= 1 || odepth >= 1, + "input (D: ", + idepth, + " H: ", + iheight, + ", W: ", + iwidth, + ") is too small." + " Calculated output D: ", + odepth, + " H: ", + oheight, + " W: ", + owidth); + + TORCH_CHECK(canUse32BitIndexMath(grad_output), + "output gradient tensor must fit into 32-bit index math"); + + TORCH_CHECK( + num_planes == grad_output.size(plane_dim), + "grad_output width unexpected. 
Expected: ", + num_planes, + ", Got: ", + grad_output.size(plane_dim)); + TORCH_CHECK( + owidth == grad_output.size(dimw), + "grad_output width unexpected. Expected: ", + owidth, + ", Got: ", + grad_output.size(dimw)); + TORCH_CHECK( + oheight == grad_output.size(dimh), + "grad_output height unexpected. Expected: ", + oheight, + ", Got: ", + grad_output.size(dimh)); + TORCH_CHECK( + odepth == grad_output.size(dimd), + "grad_output depth unexpected. Expected: ", + odepth, + ", Got: ", + grad_output.size(dimd)); +} + +void replication_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding) { + // See Note [Writing Nondeterministic Operations] + // Nondeterministic because of atomicAdd usage + globalContext().alertNotDeterministic("replication_pad3d_backward_xpu"); + TORCH_CHECK(padding.size() == 6, "padding Size is expected to be 6"); + + int pad_left = padding[0]; + int pad_right = padding[1]; + int pad_top = padding[2]; + int pad_bottom = padding[3]; + int pad_front = padding[4]; + int pad_back = padding[5]; + shapeAndGradOutputCheck3d( + input, + grad_output, + pad_left, + pad_right, + pad_top, + pad_bottom, + pad_front, + pad_back); + + grad_input.resize_as_(input); + if (grad_input.numel() == 0) { + return; + } + grad_input.zero_(); + int num_input_dims = input.dim(); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, + kBFloat16, + input.scalar_type(), + "replication_pad3d_backward_xpu", + [&] { + auto grad_input_ = grad_input; + auto grad_output_ = grad_output; + if (num_input_dims == 4) { + grad_input_ = grad_input.unsqueeze(0); + grad_output_ = grad_output.unsqueeze(0); + } + auto grad_input_packed = grad_input_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); + replication_pad3d_backward_template( + grad_input_packed, + grad_output_packed, + pad_left, + pad_top, + pad_front); + }); +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h new file mode 100644 index 000000000..85d19d5bf --- /dev/null +++ b/src/ATen/native/xpu/sycl/ReplicationPaddingKernels.h @@ -0,0 +1,40 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void replication_pad1d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad1d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad2d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad3d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef padding); + +void replication_pad3d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 37f4ab220..6f8fe8d3a 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -170,6 +170,12 @@ # https://github.com/pytorch/pytorch/issues/130916 "test_compare_cpu_histogram_xpu_float32", "test_compare_cpu_histogram_xpu_float64", + + # Precision error. 
+ # Mismatched elements: 2 / 125 (1.6%) + # Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed) + # Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed) + "test_compare_cpu_cumprod_xpu_bfloat16", ) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 9b3ee286b..2a36ddfb0 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -59,7 +59,7 @@ "clamp_min", "clone", "copy", - "cumprod" + "cumprod", "cumsum", "equal", "eq", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 75a40f54f..77d77d4f0 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -564,6 +564,18 @@ supported: - reflection_pad3d.out - reflection_pad3d_backward - reflection_pad3d_backward.grad_input + - replication_pad1d + - replication_pad1d.out + - replication_pad1d_backward + - replication_pad1d_backward.grad_input + - replication_pad2d + - replication_pad2d.out + - replication_pad2d_backward + - replication_pad2d_backward.grad_input + - replication_pad3d + - replication_pad3d.out + - replication_pad3d_backward + - replication_pad3d_backward.grad_input - native_group_norm - native_group_norm_backward - elu From 0608225e2b9953a7dc8c20d57cb3b100fc3140bd Mon Sep 17 00:00:00 2001 From: Stonepia Date: Mon, 29 Jul 2024 11:20:03 +0800 Subject: [PATCH 12/17] Use safe API to access Tensor data pointer (#655) The un-templated data_ptr() does not check about the type check and storage_initialized() check. Thus, it may introduce potential bugs. By using the template data_ptr, this could throw an error like below: ``` torch._dynamo.exc.TorchRuntimeError: Failed running call_function torchvision.roi_align(*(FakeTensor(..., device='xpu:0', size=(1, 1024, 50, 75), dtype=torch.bfloat16), FakeTensor(..., device='xpu:0', size=(1000, 5), dtype=torch.bfloat16), 0.0625, 14, 14, 0, True), **{}): The tensor has a non-zero number of elements, but its data is not allocated yet. Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory. 
``` --- src/ATen/native/xpu/NMS.cpp | 5 ++-- src/ATen/native/xpu/sycl/MultiTensorApply.h | 25 +++++++++++-------- src/ATen/native/xpu/sycl/NMSKernel.cpp | 4 +-- src/ATen/native/xpu/sycl/Shape.cpp | 4 +-- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 10 ++++---- .../native/xpu/sycl/TriangularOpsKernels.cpp | 4 +-- 6 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/ATen/native/xpu/NMS.cpp b/src/ATen/native/xpu/NMS.cpp index 37ed19d50..ea1ac4e9c 100644 --- a/src/ATen/native/xpu/NMS.cpp +++ b/src/ATen/native/xpu/NMS.cpp @@ -48,14 +48,15 @@ Tensor nms(const Tensor& dets, const Tensor& scores, double iou_threshold_) { auto mask = nms_kernel(dets_sorted, iou_threshold); at::Tensor mask_cpu = mask.to(at::kCPU); - unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr(); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.mutable_data_ptr(); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); - int64_t* keep_out = (int64_t*)keep.data_ptr(); + int64_t* keep_out = keep.mutable_data_ptr(); int num_to_keep = 0; for (int i = 0; i < dets_num; i++) { diff --git a/src/ATen/native/xpu/sycl/MultiTensorApply.h b/src/ATen/native/xpu/sycl/MultiTensorApply.h index 0817e40be..bb1abe277 100644 --- a/src/ATen/native/xpu/sycl/MultiTensorApply.h +++ b/src/ATen/native/xpu/sycl/MultiTensorApply.h @@ -164,7 +164,7 @@ void multi_tensor_apply( tensor_lists[0][0].options().dtype(at::kByte)); auto metaAddressInput = static_cast*>( - addressStorage.data_ptr()); + addressStorage.mutable_data_ptr()); TLMetaForAddressScalar* tlAddress = nullptr; auto tlAddress_dptr = at::xpu::HostAlloc( @@ -180,7 +180,7 @@ void multi_tensor_apply( tlAddress[t].scalar_vals = scalars[t].to(); totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -196,7 +196,8 @@ void multi_tensor_apply( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); @@ -248,7 +249,7 @@ void multi_tensor_apply( {(int)(sizeof(TLMetaForAddress) * n_tensors)}, tensor_lists[0][0].options().dtype(at::kByte)); auto metaAddressInput = - static_cast*>(addressStorage.data_ptr()); + static_cast*>(addressStorage.mutable_data_ptr()); TLMetaForAddress* tlAddress = nullptr; auto tlAddress_dptr = @@ -262,7 +263,7 @@ void multi_tensor_apply( tlAddress[t].numel_to_tensor = numel; totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -278,7 +279,8 @@ void multi_tensor_apply( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); @@ -324,8 +326,8 @@ void multi_tensor_apply_for_fused_optimizer( auto 
addressStorage = at::empty( {(int)(sizeof(TLFusedMetaForAddress) * n_tensors)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaFusedAddressInput = - static_cast*>(addressStorage.data_ptr()); + auto metaFusedAddressInput = static_cast*>( + addressStorage.mutable_data_ptr()); TLFusedMetaForAddress* tlAddress = nullptr; auto tlAddress_dptr = @@ -337,10 +339,10 @@ void multi_tensor_apply_for_fused_optimizer( for (size_t t = 0; t < n_tensors; ++t) { auto numel = tensor_lists[0][t].numel(); tlAddress[t].numel_to_tensor = numel; - tlAddress[t].state_steps_addresses = state_steps[t].data_ptr(); + tlAddress[t].state_steps_addresses = state_steps[t].mutable_data_ptr(); totalWG += (numel + kChunkSize - 1) / kChunkSize; for (int d = 0; d < depth; ++d) { - tlAddress[t].addresses[d] = tensor_lists[d][t].data_ptr(); + tlAddress[t].addresses[d] = tensor_lists[d][t].mutable_data_ptr(); } } @@ -356,7 +358,8 @@ void multi_tensor_apply_for_fused_optimizer( auto wgMetaStorage = at::empty( {(int)(sizeof(TLMetaForWG) * totalWG)}, tensor_lists[0][0].options().dtype(at::kByte)); - auto metaWGInput = static_cast(wgMetaStorage.data_ptr()); + auto metaWGInput = + static_cast(wgMetaStorage.mutable_data_ptr()); TLMetaForWG* tlWGMeta = nullptr; auto tlWGMeta_dptr = at::xpu::HostAlloc(sizeof(TLMetaForWG) * totalWG); diff --git a/src/ATen/native/xpu/sycl/NMSKernel.cpp b/src/ATen/native/xpu/sycl/NMSKernel.cpp index 451555810..3bf712582 100644 --- a/src/ATen/native/xpu/sycl/NMSKernel.cpp +++ b/src/ATen/native/xpu/sycl/NMSKernel.cpp @@ -111,8 +111,8 @@ Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold) { (size_t)col_blocks, (size_t)col_blocks * nms_items_per_group}; sycl::range<2> local_range{1, (size_t)nms_items_per_group}; using acc_t = acc_type_device; - auto dets_sorted_ptr = (scalar_t*)dets_sorted.data_ptr(); - auto mask_ptr = (unsigned long long*)mask.data_ptr(); + auto dets_sorted_ptr = dets_sorted.data_ptr(); + auto mask_ptr = (unsigned long long*)mask.data_ptr(); auto caller = NMSKernelFunctor( dets_num, iou_threshold, dets_sorted_ptr, mask_ptr); sycl_kernel_submit( diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 345d5078a..1c9c1b9d5 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -186,7 +186,7 @@ void parallel_cat( int nDims) { // First, let's set up our kernel parameters. We start with a raw pointer to // the storage for the output Tensor. 
- scalar_out_t* data = static_cast(out.data_ptr()); + scalar_out_t* data = static_cast(out.mutable_data_ptr()); // Kernel Parameter int64_t tensorMetadataSize = @@ -195,7 +195,7 @@ void parallel_cat( auto d_inputs_storage = at::empty({tensorMetadataSize}, out.options().dtype(at::kByte)); auto d_inputs = static_cast*>( - d_inputs_storage.data_ptr()); + d_inputs_storage.mutable_data_ptr()); OutputTensorSizeStride param; diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 57ebb9846..8db72165a 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -1367,9 +1367,9 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int input_start = - ((uint64_t)input.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)input.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = @@ -1558,11 +1558,11 @@ void spatial_softmax_backward( using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); int gradin_start = - ((uint64_t)gradInput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradInput.const_data_ptr()) % align_bytes / sizeof(scalar_t); int output_start = - ((uint64_t)output.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)output.const_data_ptr()) % align_bytes / sizeof(scalar_t); int gradoutput_start = - ((uint64_t)gradOutput.data_ptr()) % align_bytes / sizeof(scalar_t); + ((uint64_t)gradOutput.const_data_ptr()) % align_bytes / sizeof(scalar_t); // decide indexing range: uint32_t (4GB) or uint64_t (>4GB) bool can_use_32bit_index = canUse32BitIndexMath(gradInput) && diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index 07fd0be0a..65c165c2c 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -83,8 +83,8 @@ void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { IndexType result_stride_0 = (IndexType)result.stride(-2); IndexType result_stride_1 = (IndexType)result.stride(-1); - scalar_t* result_ptr = (scalar_t*)(result.data_ptr()); - scalar_t* self_ptr = (scalar_t*)(self.data_ptr()); + scalar_t* result_ptr = result.data_ptr(); + scalar_t* self_ptr = self.data_ptr(); ApplyTriuTrilKernelFunctor kfn( k, From e210c5cee1922c820643967d94f39cc30325ff23 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:14:51 +0800 Subject: [PATCH 13/17] Enable aten::smooth_l1_loss forward/backward (#621) --- src/ATen/native/xpu/Loss.cpp | 58 +++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 2 - .../native/xpu/sycl/BinaryMiscOpsKernels.cpp | 26 +++++++++ .../native/xpu/sycl/BinaryMiscOpsKernels.h | 2 + .../native/xpu/sycl/PointwiseOpsKernels.cpp | 34 +++++++++++ .../native/xpu/sycl/PointwiseOpsKernels.h | 2 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 3 + 8 files changed, 126 insertions(+), 2 deletions(-) diff --git a/src/ATen/native/xpu/Loss.cpp b/src/ATen/native/xpu/Loss.cpp index f09f68b8a..050ff07b9 100644 --- a/src/ATen/native/xpu/Loss.cpp +++ 
b/src/ATen/native/xpu/Loss.cpp @@ -80,6 +80,64 @@ Tensor& XPUNativeFunctions::mse_loss_backward_out( return grad_input; } + +Tensor& XPUNativeFunctions::smooth_l1_loss_out( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& result) { + if (reduction != Reduction::None) { + TORCH_INTERNAL_ASSERT( + reduction == Reduction::Mean || reduction == Reduction::Sum); + result.resize_({}); + Tensor loss; + auto iter = TensorIterator::borrowing_binary_op(loss, input, target); + native::xpu::smooth_l1_kernel(iter, beta); + if (reduction == Reduction::Mean) { + at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); + } else { + at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); + } + } else { + auto iter = TensorIterator::borrowing_binary_op(result, input, target); + native::xpu::smooth_l1_kernel(iter, beta); + } + return result; +} + +Tensor XPUNativeFunctions::smooth_l1_loss( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta) { + Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + result = XPUNativeFunctions::smooth_l1_loss_out( + input, target, reduction, beta, result); + return result; +} + +Tensor& XPUNativeFunctions::smooth_l1_loss_backward_out( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) { + auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; + auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) + .promote_inputs_to_common_dtype(true) + .cast_common_dtype_to_outputs(true) + .enforce_safe_casting_to_output(true) + .build(); + native::xpu::smooth_l1_backward_kernel(iter, norm, beta); + return grad_input; +} + Tensor XPUNativeFunctions::binary_cross_entropy( const Tensor& self, const Tensor& target, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3f2064653..93321f23d 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -257,8 +257,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "signbit.out", "sign.out", "sinc.out", - "smooth_l1_loss_backward.grad_input", - "smooth_l1_loss.out", "special_airy_ai.out", "special_bessel_j0.out", "special_bessel_j1.out", diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp index 00c5398af..5ac71c163 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.cpp @@ -23,6 +23,32 @@ void mse_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, MSEFunctor()); }); } +template +struct SmoothL1Functor { + scalar_t operator()(scalar_t input, scalar_t target) const { + auto z = std::abs(input - target); + return z < beta_val ? 
scalar_t(0.5) * z * z / beta_val + : z - scalar_t(0.5) * beta_val; + } + SmoothL1Functor(scalar_t beta_val) : beta_val(beta_val) {} + + private: + scalar_t beta_val; +}; + +void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "smooth_l1_xpu", + [&iter, beta]() { + scalar_t beta_val(beta); + SmoothL1Functor f(beta_val); + gpu_kernel(iter, f); + }); +} + template struct HuberFunctor { scalar_t operator()(scalar_t a, scalar_t b) const { diff --git a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h index 94cfb7c90..17672ec29 100644 --- a/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h +++ b/src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void mse_kernel(TensorIteratorBase& iter); +void smooth_l1_kernel(TensorIteratorBase& iter, double beta); + void huber_kernel(TensorIterator& iter, double delta); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 5dc06e25a..822a83e99 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -125,6 +125,40 @@ void mse_backward_kernel(TensorIterator& iter, const Scalar& value) { }); } +template +struct SmoothL1BackwardFunctor { + scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output) + const { + const auto x = input - target; + if (x < -beta_val) + return -norm_val * grad_output; + else if (x > beta_val) + return norm_val * grad_output; + else + return norm_val * x * grad_output / beta_val; + } + SmoothL1BackwardFunctor(scalar_t norm_val, scalar_t beta_val) + : norm_val(norm_val), beta_val(beta_val) {} + + private: + scalar_t norm_val; + scalar_t beta_val; +}; + +void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta) { + AT_DISPATCH_ALL_TYPES_AND2( + kHalf, + kBFloat16, + iter.dtype(), + "smooth_l1_backward_xpu", + [&iter, &norm, beta] { + auto norm_val = norm.to(); + scalar_t beta_val(beta); + SmoothL1BackwardFunctor f(norm_val, beta_val); + gpu_kernel(iter, f); + }); +} + template struct HuberBackwardFunctor { scalar_t operator()(scalar_t input, scalar_t target, scalar_t grad_output) diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index 586a64f3c..613c3cca6 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -10,6 +10,8 @@ void addcdiv_kernel(TensorIterator& iter, Scalar value); void mse_backward_kernel(TensorIterator& iter, const Scalar& value); +void smooth_l1_backward_kernel(TensorIterator& iter, Scalar norm, double beta); + void huber_backward_kernel( TensorIterator& iter, const Scalar& norm, diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 2a36ddfb0..c281747f2 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -179,6 +179,7 @@ "nn.functional.upsample_bilinear", "nn.functional.upsample_nearest", "nn.functional.nll_loss", + "nn.functional.smooth_l1_loss", "nn.functional.mse_loss", "nn.functional.binary_cross_entropy", "nn.functional.huber_loss", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 77d77d4f0..fd087c7bc 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -308,6 +308,9 @@ supported: - bitwise_and.Tensor_out - bitwise_or.Tensor_out 
- bitwise_xor.Tensor_out + - smooth_l1_loss + - smooth_l1_loss.out + - smooth_l1_loss_backward.grad_input - bitwise_not.out - where.self_out - where.self From 36dfe230dea6a737fe260b072276cbcca3ca3f9a Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:40:20 +0800 Subject: [PATCH 14/17] Add aten::polar and its variants (#606) Co-authored-by: yucai Co-authored-by: Feng Yuan --- src/ATen/native/xpu/TensorFactories.cpp | 15 +++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/ComplexKernels.cpp | 14 ++++++++++++++ src/ATen/native/xpu/sycl/ComplexKernels.h | 2 ++ test/xpu/extended/run_test_with_skip.py | 4 ++++ test/xpu/run_test_with_skip.py | 12 ++++++++++-- test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 8 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index 110590958..44da487f7 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -151,6 +151,21 @@ Tensor& XPUNativeFunctions::complex_out( return result; } +Tensor& XPUNativeFunctions::polar_out( + const Tensor& abs, + const Tensor& angle, + Tensor& result) { + complex_check_dtype(result, abs, angle); + auto iter = TensorIteratorConfig() + .add_output(result) + .add_const_input(abs) + .add_const_input(angle) + .check_all_same_dtype(false) + .build(); + native::xpu::polar_kernel(iter); + return result; +} + Tensor& XPUNativeFunctions::randperm_out( int64_t n, c10::optional generator, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 93321f23d..4a4c96828 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -240,7 +240,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "ormqr", "_pdist_backward", "_pdist_forward", - "polar.out", "_prelu_kernel", "_prelu_kernel_backward", "prod", diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 56b25d0ef..87504bd5e 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -21,4 +21,18 @@ void complex_kernel(TensorIterator& iter) { }); } +template +struct PolarFunctor { + c10::complex operator()(scalar_t a, scalar_t b) const { + return c10::complex(a * std::cos(b), a * std::sin(b)); + } +}; + +void polar_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "polar_xpu", [&]() { + PolarFunctor f; + gpu_kernel(iter, f); + }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h index 990bcd14e..d51556b4f 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.h +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -6,4 +6,6 @@ namespace at::native::xpu { void complex_kernel(TensorIterator& iter); +void polar_kernel(TensorIterator& iter); + } // namespace at::native::xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 6f8fe8d3a..a75d2e675 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -154,6 +154,10 @@ # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) "test_compare_cpu_hypot_xpu_bfloat16", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. 
+ # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_compare_cpu_polar_xpu_bfloat16", + # Regressions due to PyTorch uplift (Numeric difference in float and bfloat) # https://github.com/intel/torch-xpu-ops/issues/549 # Example fail log diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 719af3ca4..7d051607e 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -782,6 +782,10 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf' "test_dtypes_cosh_xpu", + # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. + # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. + "test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -3016,8 +3020,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): res += launch_test("nn/test_load_state_dict_xpu.py") # test_module_hooks - -res += launch_test("nn/test_module_hooks_xpu.py") +skip_list = ( + # TypeError: TestStateDictHooks.test_register_state_dict_post_hook() missing 1 required positional argument: 'private' + # https://github.com/intel/torch-xpu-ops/issues/658 + "test_register_state_dict_post_hook", +) +res += launch_test("nn/test_module_hooks_xpu.py", skip_list) # test_parametrization diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index c281747f2..823988488 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -208,6 +208,7 @@ "unique", "multinomial", "lerp", + "polar", "frac", "aminmax", "argmin", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index fd087c7bc..9d453d215 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -268,6 +268,7 @@ supported: - eye.m_out - _efficientzerotensor - complex.out + - polar.out - clone - fill_.Scalar - fill_.Tensor From 82268376d9f215f5ca0988264608ead45ae028f9 Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 30 Jul 2024 13:19:00 +0800 Subject: [PATCH 15/17] Enable weekly test (#637) 1. enable weekly test contains 3 suites full e2e and full ut 2. 
always() to not cancelled() --- .github/workflows/nightly_ondemand.yml | 59 +++++++++++++++++++++----- .github/workflows/pull.yml | 4 +- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 1a663661e..e5be18fb5 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -2,8 +2,10 @@ name: Nightly-OnDemand Tests on: schedule: - # GMT+8 21:00 every day - - cron: '0 13 * * *' + # GMT+8 21:00 every workday + - cron: '0 13 * * 0-4' + # GMT+8 0:00 Saturday + - cron: '0 16 * * 5' workflow_dispatch: inputs: pytorch: @@ -78,7 +80,7 @@ jobs: runs-on: pvc_e2e # Don't run on forked repos if: github.repository_owner == 'intel' - timeout-minutes: 900 + timeout-minutes: 3600 env: pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} @@ -174,8 +176,10 @@ jobs: echo "$GITHUB_ENV" rm -rf ../pytorch/inductor_log rm -rf /tmp/torchinductor_* + + # Nihglty launch - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: huggingface @@ -185,7 +189,7 @@ jobs: scenario: accuracy hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Torchbench BF16 Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: torchbench @@ -195,7 +199,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Timm_models FP16 Training Accuracy Test - if: github.event_name == 'schedule' + if: github.event_name == 'schedule' && github.event.schedule == '0 13 * * 0-4' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: timm_models @@ -204,6 +208,38 @@ jobs: scenario: accuracy env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + # Weekly launch + - name: Weekly Huggingface Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: huggingface + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Weekly Torchbench Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: torchbench + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Weekly Timm_models Full Test + if: github.event_name == 'schedule' && github.event.schedule == '0 16 * * 5' + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: timm_models + env_prepare: true + dt: float32,bfloat16,float16,amp_bf16,amp_fp16 + mode: inference,training + scenario: accuracy,performance + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + # On-demand launch - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) if: github.event_name != 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test @@ -216,7 +252,7 @@ jobs: hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} - name: Summarize archieve files id: summary - if: always() + if: ${{ ! cancelled() }} run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files @@ -237,14 +273,14 @@ jobs: exit 1 fi - name: Upload Inductor XPU E2E Data - if: always() + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files - + Tests-Failure-And-Report: - if: always() + if: ${{ ! cancelled() }} runs-on: pvc_e2e permissions: issues: write @@ -288,6 +324,9 @@ jobs: test_type="On-demand" test_issue_id=426 cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}" + elif [ "${{ github.event.schedule }}" == "0 16 * * 5" ];then + test_type="Weekly" + test_issue_id=432 else test_type="Nightly" test_issue_id=432 diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5ee55fdcf..350c88f91 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,7 +127,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Summarize archieve files - if: always() + if: ${{ ! cancelled() }} run: | rm -rf ${{ github.workspace }}/upload_files cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files @@ -137,7 +137,7 @@ jobs: exit 1 fi - name: Upload Inductor XPU E2E Data - if: always() + if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 with: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} From 67116b3c359c619f0fdb9b0266f4fbd6607cc956 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Tue, 30 Jul 2024 13:55:12 +0800 Subject: [PATCH 16/17] Add aten::masked_scatter_ (#652) Add aten::masked_scatter_. 
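For reference, a minimal usage sketch of the operator this patch enables (illustrative only, not part of the change; it assumes a build where `torch.xpu.is_available()` reports a device and falls back to CPU otherwise):

```python
# Illustrative sketch of aten::masked_scatter_ semantics; not part of this patch.
import torch

device = "xpu" if torch.xpu.is_available() else "cpu"

dst = torch.zeros(2, 3, device=device)
mask = torch.tensor([[True, False, True],
                     [False, True, False]], device=device)
src = torch.tensor([1.0, 2.0, 3.0], device=device)  # must hold >= mask.sum() elements

dst.masked_scatter_(mask, src)
# dst == [[1., 0., 2.],
#         [0., 3., 0.]]
# True positions consume src elements in row-major order, which is the
# ordering the kernel derives from the exclusive prefix sum over the mask.
```

The exclusive scan over the flattened mask gives each True element its offset into `source`, and the size-check functor asserts asynchronously that `source` provides at least that many elements.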
--------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Indexing.cpp | 30 +++++++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/Indexing.cpp | 90 +++++++++++++++++++ src/ATen/native/xpu/sycl/IndexingKernels.h | 6 ++ src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h | 13 ++- test/xpu/test_torch_xpu.py | 4 +- test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 8 files changed, 136 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp index 5db9a7238..d4d5598e6 100644 --- a/src/ATen/native/xpu/Indexing.cpp +++ b/src/ATen/native/xpu/Indexing.cpp @@ -45,6 +45,36 @@ Tensor XPUNativeFunctions::index_select( return index_select_out(self, dim, index, out); } +Tensor& XPUNativeFunctions::masked_scatter_( + Tensor& self, + const Tensor& mask, + const Tensor& source) { + at::assert_no_internal_overlap(self); + TORCH_CHECK( + self.scalar_type() == source.scalar_type(), + "masked_scatter_: expected self and source to have same dtypes but got ", + self.scalar_type(), + " and ", + source.scalar_type()); + TORCH_CHECK( + mask.dtype() == ScalarType::Bool, + "masked_scatter_ only supports boolean masks, " + "but got mask with dtype ", + mask.dtype()); + + c10::MaybeOwned b_mask = + expand_inplace(self, mask, "masked_scatter_"); + + if (self.numel() == 0) { + return self; + } + + auto maskPrefixSum = at::empty(self.sizes(), mask.options().dtype(kLong)); + native::xpu::masked_scatter_kernel(self, *b_mask, maskPrefixSum, source); + + return self; +} + static Tensor& masked_select_out_impl( Tensor& result, const Tensor& self, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 4a4c96828..23b79d6dd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -224,7 +224,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "log_normal_", "logspace.out", "lu_unpack.out", - "masked_scatter_", "max_pool3d_with_indices", "max_pool3d_with_indices_backward", "max_unpool2d", diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index 312f4ce12..fcee372d1 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -597,6 +597,7 @@ void index_put_deterministic_kernel( if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { auto expanded_size = at::DimVector(expandedValue.sizes()); + auto size1 = expandedValue.sizes(); auto size2 = linearIndex.sizes(); if (are_expandable(size1, size2)) { @@ -667,6 +668,95 @@ void index_put_deterministic_kernel( } } +template +struct MaskedScatterElementwiseFunctor { + scalar_t operator()( + const scalar_t a, + const bool mask, + const int64_t maskPrefixSum) const { + if (mask) { + return source_ptr_[maskPrefixSum]; + } + return a; + } + MaskedScatterElementwiseFunctor(const scalar_t* source_ptr) + : source_ptr_(source_ptr) {} + + private: + const scalar_t* source_ptr_; +}; + +struct MaskedScatterSizeCheckFunctor { + void operator()(sycl::nd_item<1> item) const { + const auto totalElements = *mask_exclusive_sum_ + *mask_; + SYCL_KERNEL_ASSERT(totalElements <= srcSize_); + } + MaskedScatterSizeCheckFunctor( + const int64_t* const mask_exclusive_sum, + const bool* const mask, + const int64_t srcSize) + : mask_exclusive_sum_(mask_exclusive_sum), + mask_(mask), + srcSize_(srcSize) {} + + private: + const int64_t* const mask_exclusive_sum_; + const bool* const mask_; + const int64_t srcSize_; +}; + +void masked_scatter_kernel( + const 
TensorBase& self, + const TensorBase& mask, + const TensorBase& maskPrefixSum, + const TensorBase& source) { + const auto srcSize = source.numel(); + const auto mask_cont = mask.contiguous(); + const auto mask_numel = mask.numel(); + + // Use a prefix sum to determine the output locations of the masked elements + auto maskPrefixSum_data = maskPrefixSum.mutable_data_ptr(); + auto mask_data = mask_cont.const_data_ptr(); + + pstl::exclusive_scan( + mask_data, + mask_data + mask_numel, + maskPrefixSum_data, + static_cast(0)); + + // Asynchronously check that the number of `1` elements present in the mask + // must be <= the number of elements available in `src`. + auto caller = MaskedScatterSizeCheckFunctor( + &maskPrefixSum_data[mask_numel - 1], &mask_data[mask_numel - 1], srcSize); + sycl_kernel_submit((size_t)1, (size_t)1, getCurrentSYCLQueue(), caller); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + auto source_contig = source.contiguous(); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_input(self) + .add_const_input(mask_cont) + .add_input(maskPrefixSum) + .build(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + ScalarType::Bool, + ScalarType::BFloat16, + ScalarType::Half, + self.scalar_type(), + "masked_scatter_", + [&]() { + auto source_ptr = source_contig.const_data_ptr(); + gpu_kernel(iter, MaskedScatterElementwiseFunctor(source_ptr)); + }); +} + } // namespace at::native::xpu + #pragma GCC diagnostic pop #pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/IndexingKernels.h b/src/ATen/native/xpu/sycl/IndexingKernels.h index cde537e73..8f32f49f9 100644 --- a/src/ATen/native/xpu/sycl/IndexingKernels.h +++ b/src/ATen/native/xpu/sycl/IndexingKernels.h @@ -47,4 +47,10 @@ void index_put_deterministic_kernel( bool accumulate, bool unsafe); +void masked_scatter_kernel( + const TensorBase& self, + const TensorBase& mask, + const TensorBase& maskPrefixSum, + const TensorBase& source); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h index 66c316e25..efc9f164e 100644 --- a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h +++ b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h @@ -1,11 +1,10 @@ #pragma once #include -#include - #include #include #include +#include #include #include #include @@ -23,10 +22,10 @@ struct KSScanKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { // initialize local_input auto cur_init = init_; if (scan_type == 1) { - local_scan_[local_id] = first_[local_id]; + local_scan_[local_id] = c10::load(&first_[local_id]); } else { if (local_id > 0) - local_scan_[local_id] = first_[local_id - 1]; + local_scan_[local_id] = c10::load(&first_[local_id - 1]); else local_scan_[local_id] = 0; } @@ -72,17 +71,17 @@ struct KSScanWithCarrierKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { auto cur_init = (group_id == 0 ? 
init_ : 0); if (global_id < N_) { if (scan_type == 1) { - local_scan_[local_id] = first_[global_id]; + local_scan_[local_id] = c10::load(&first_[global_id]); } else { if (local_id > 0) - local_scan_[local_id] = first_[global_id - 1]; + local_scan_[local_id] = c10::load(&first_[global_id - 1]); else local_scan_[local_id] = 0; } if (local_id == 0) local_scan_[local_id] += cur_init; if (local_id == wgroup_size_ - 1) { - carry_ptr_[group_id] = first_[global_id]; + carry_ptr_[group_id] = c10::load(&first_[global_id]); } } item_id.barrier(sycl_local_fence); diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index b82a8ec67..80fb3c8b0 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -3995,11 +3995,11 @@ def test_masked_scatter(self, device, dtype): dest_ones.masked_scatter_(mask, src_ones) self.assertEqual(dest_ones, dest_ones_expected, atol=0, rtol=0) - # Bound checking in CUDA is done inside a kernel + # Bound checking in GPU is done inside a kernel # in order to avoid synchronization, but this means # we can not clear the failures. So there is no way # to test it then recover. - if self.device_type != 'cuda' or self.device_type != 'xpu': + if self.device_type != 'cuda' and self.device_type != 'xpu': # make src smaller. this should fail src = torch.zeros(num_copy - 1, dtype=dt, device=device) with self.assertRaises(RuntimeError): diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 823988488..635b1f1b6 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -80,6 +80,7 @@ "index_fill", "index_put", "index_select", + "masked_scatter", "masked_select", "isin", "isnan", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 9d453d215..5c354e0b0 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -105,6 +105,7 @@ supported: - isnan.out - masked_fill_.Tensor - masked_fill_.Scalar + - masked_scatter_ - index_add.out - index_add_ - index_add From 8a821bf6aaa72f733ba65f6b2cbef40983bd71a9 Mon Sep 17 00:00:00 2001 From: "Huaiyu, Zheng" Date: Tue, 30 Jul 2024 14:37:25 +0800 Subject: [PATCH 17/17] Add aten::_weight_norm_interface and aten::_weight_norm_interface_backward (#607) add aten::_weight_norm_interface and aten::_weight_norm_interface_backward --- src/ATen/native/xpu/WeightNorm.cpp | 27 + .../xpu/sycl/AdaptiveMaxPooling2dKernels.cpp | 8 +- src/ATen/native/xpu/sycl/BatchKernel.h | 237 ++-- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 13 +- src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 5 +- src/ATen/native/xpu/sycl/Indexing.h | 14 +- .../native/xpu/sycl/WeightNormKernels.cpp | 1027 +++++++++++++++++ src/ATen/native/xpu/sycl/WeightNormKernels.h | 16 + yaml/xpu_functions.yaml | 2 + 9 files changed, 1234 insertions(+), 115 deletions(-) create mode 100644 src/ATen/native/xpu/WeightNorm.cpp create mode 100644 src/ATen/native/xpu/sycl/WeightNormKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/WeightNormKernels.h diff --git a/src/ATen/native/xpu/WeightNorm.cpp b/src/ATen/native/xpu/WeightNorm.cpp new file mode 100644 index 000000000..7fec9ecfe --- /dev/null +++ b/src/ATen/native/xpu/WeightNorm.cpp @@ -0,0 +1,27 @@ +#include +#include +namespace at { +std::tuple XPUNativeFunctions::_weight_norm_interface( + const Tensor& v, + const Tensor& g, + int64_t dim) { + return native::xpu::weight_norm_kernel(v, g, dim); +} + +std::tuple XPUNativeFunctions::_weight_norm_interface_backward( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + 
int64_t dim) { + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + TORCH_CHECK( + dim == 0 || dim == saved_v.dim() - 1, + "fused kernels can only be applied for first or last dim") + + return native::xpu::weight_norm_backward_kernel( + grad_w, saved_v, saved_g, saved_norms, dim); +} +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp index fe50d1a6c..9da231b0d 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.cpp @@ -124,8 +124,8 @@ void launch_adaptive_max_pool2d_kernel( using KernelClass = AdaptiveMaxPool2dKernelFunctor; int64_t output_size = batch * plane * osizeH * osizeW; - BatchKernelConfig cfg = { - 1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, output_size, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.build(); @@ -301,8 +301,8 @@ void launch_adaptive_max_pool2d_backward_kernel( int64_t sizeP) { using KernelClass = AdaptiveMaxPool2dBackwardKernelFunctor; - BatchKernelConfig cfg = { - 1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, osize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.build(); diff --git a/src/ATen/native/xpu/sycl/BatchKernel.h b/src/ATen/native/xpu/sycl/BatchKernel.h index cff967a76..bc0da3bb5 100644 --- a/src/ATen/native/xpu/sycl/BatchKernel.h +++ b/src/ATen/native/xpu/sycl/BatchKernel.h @@ -25,6 +25,146 @@ class BatchKernelConfig { } public: + template + static BatchKernelConfig make_config( + int64_t batch, + int64_t problem, + int64_t stride, + int64_t problem_batch, + bool problem_along_x, + Policy policy = Policy::pSegment, + int64_t prefer_wg_size = 0) { + BatchKernelConfig cfg = { + batch, + problem, + stride, + problem_batch, + problem_along_x, + policy, + prefer_wg_size}; + cfg.template build(); + + return cfg; + } + + template + static BatchKernelConfig make_config( + int64_t batch, + int64_t problem, + int64_t stride, + int64_t problem_batch, + bool problem_along_x, + std::vector policies, + int64_t prefer_wg_size = 0) { + BatchKernelConfig cfg = { + batch, + problem, + stride, + problem_batch, + problem_along_x, + policies, + prefer_wg_size}; + cfg.template build(); + + return cfg; + } + + sycl::range<2> global_size() const { + return {glb_range_y_, glb_range_x_}; + } + + sycl::range<2> group_size() const { + return {wg_range_y_, wg_range_x_}; + } + + struct ItemDesc { + /* chunk id along problem dim */ size_t chunk; + /* problem chunk size */ size_t chunk_size; + /* offsite in current chunk */ size_t chunk_off; + /* how many active chunks along problem dim */ size_t chunk_num; + /* global batch id */ size_t glb_batch; + /* global problem id */ size_t glb_problem; + }; + + ItemDesc get_item_desc(sycl::nd_item<2> item) const { + auto lix = item.get_local_id(1); + auto liy = item.get_local_id(0); + auto lrx = item.get_local_range(1); + auto lry = item.get_local_range(0); + auto wgrx = item.get_group_range(1); + auto wgry = item.get_group_range(0); + auto gix = item.get_global_id(1); + auto giy = item.get_global_id(0); + auto gx = item.get_group(1); + auto gy = item.get_group(0); + + // ItemDesc::glb_problem is 
meaningless, if policy is loop for all. + if (problem_along_x_) { + return {gx, lrx, lix, wgrx, giy, gix}; + } else { + return {gy, lry, liy, wgry, gix, giy}; + } + } + + // iterate over problems and batchs for `pAdaptive` policy + // # update workload status inplace in `desc`. + // # prioritize problem iteration. + bool next(sycl::nd_item<2> item, ItemDesc& desc) const { + auto next_problem = desc.glb_problem + problem_glb_range_; + auto next_batch = desc.glb_batch + batch_glb_range_; + auto cur_chunk = desc.chunk; + + // WA: break deduce chain, or offline compiler gets crash, due to, + // massive and deep divergence level + desc = get_item_desc(item); + + // iterate over problem + if (next_problem < problem_range_) { + desc.glb_problem = next_problem; + desc.chunk = cur_chunk + desc.chunk_num; + return true; + } + + // iterate over batch + if (next_batch < batch_range_) { + desc.glb_batch = next_batch; + return true; + } + + return false; + } + + static Policy suggest_policy( + int64_t batch, + int64_t problem, + int64_t stride, + bool problem_along_x, + bool bypass_adaptive_policy = true) { + auto target_wi_num = syclMaxWorkItemsPerTile(); + + if (!bypass_adaptive_policy && batch * problem * stride >= target_wi_num) { + return Policy::pAdaptive; + } + // Using device max work group size to deduce range configuration + // approximately. + BatchKernelConfig cfg_ = make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + Policy::pLoop, + syclDeviceMaxWorkGroupSize()); + size_t wg_num = (cfg_.glb_range_x_ / cfg_.wg_range_x_) * + (cfg_.glb_range_y_ / cfg_.wg_range_y_); + size_t wg_size = cfg_.wg_range_x_ * cfg_.wg_range_y_; + if (wg_size * (wg_num + 1) > target_wi_num) { + return Policy::pLoop; + } + + return Policy::pSegment; + } + BatchKernelConfig( int64_t batch, int64_t problem, @@ -52,11 +192,14 @@ class BatchKernelConfig { template void build() { - size_t wg_size = syclMaxWorkGroupSize(); + size_t wg_size; size_t sg_size = syclMaxSubGroupSize(); + // Caller takes responsibility of if work group size is valid or compatible. if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 && - prefer_wg_size_ < wg_size) { + prefer_wg_size_ <= syclDeviceMaxWorkGroupSize()) { wg_size = prefer_wg_size_; + } else { + wg_size = syclMaxWorkGroupSize(); } wg_range_x_ = sg_size; wg_range_y_ = wg_size / wg_range_x_; @@ -170,96 +313,6 @@ class BatchKernelConfig { }(), prefer_wg_size) {} - sycl::range<2> global_size() const { - return {glb_range_y_, glb_range_x_}; - } - - sycl::range<2> group_size() const { - return {wg_range_y_, wg_range_x_}; - } - - struct ItemDesc { - /* chunk id along problem dim */ size_t chunk; - /* problem chunk size */ size_t chunk_size; - /* offsite in current chunk */ size_t chunk_off; - /* how many active chunks along problem dim */ size_t chunk_num; - /* global batch id */ size_t glb_batch; - /* global problem id */ size_t glb_problem; - }; - - ItemDesc get_item_desc(sycl::nd_item<2> item) const { - auto lix = item.get_local_id(1); - auto liy = item.get_local_id(0); - auto lrx = item.get_local_range(1); - auto lry = item.get_local_range(0); - auto wgrx = item.get_group_range(1); - auto wgry = item.get_group_range(0); - auto gix = item.get_global_id(1); - auto giy = item.get_global_id(0); - auto gx = item.get_group(1); - auto gy = item.get_group(0); - - // ItemDesc::glb_problem is meaningless, if policy is loop for all. 
- if (problem_along_x_) { - return {gx, lrx, lix, wgrx, giy, gix}; - } else { - return {gy, lry, liy, wgry, gix, giy}; - } - } - - // iterate over problems and batchs for `pAdaptive` policy - // # update workload status inplace in `desc`. - // # prioritize problem iteration. - bool next(sycl::nd_item<2> item, ItemDesc& desc) const { - auto next_problem = desc.glb_problem + problem_glb_range_; - auto next_batch = desc.glb_batch + batch_glb_range_; - auto cur_chunk = desc.chunk; - - // WA: break deduce chain, or offline compiler gets crash, due to, - // massive and deep divergence level - desc = get_item_desc(item); - - // iterate over problem - if (next_problem < problem_range_) { - desc.glb_problem = next_problem; - desc.chunk = cur_chunk + desc.chunk_num; - return true; - } - - // iterate over batch - if (next_batch < batch_range_) { - desc.glb_batch = next_batch; - return true; - } - - return false; - } - - static Policy suggest_policy( - int64_t batch, - int64_t problem, - int64_t stride, - bool problem_along_x, - bool bypass_adaptive_policy = true) { - auto target_wi_num = syclMaxWorkItemsPerTile(); - - if (!bypass_adaptive_policy && batch * problem * stride >= target_wi_num) { - return Policy::pAdaptive; - } - - BatchKernelConfig cfg_ = { - batch, problem, stride, batch * stride, problem_along_x, Policy::pLoop}; - size_t wg_num = (cfg_.glb_range_x_ / cfg_.wg_range_x_) * - (cfg_.glb_range_y_ / cfg_.wg_range_y_); - size_t wg_size = cfg_.wg_range_x_ * cfg_.wg_range_y_; - - if (wg_size * (wg_num + 1) > target_wi_num) { - return Policy::pLoop; - } - - return Policy::pSegment; - } - public: /* logical shape desc */ int64_t batch_; /* logical shape desc */ int64_t problem_; diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 7fd51ddf3..eab6f4c48 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -371,9 +371,8 @@ void launch_max_pool2d_kernel( auto& queue = at::xpu::getCurrentSYCLQueue(); int outputSize = numBatch * numPlane * outputSizeH * outputSizeW; int stride = numPlane * outputSizeH * outputSizeW; - BatchKernelConfig cfg = { - 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - cfg.template build(); + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); auto kfn = KernelClass( output, indices, @@ -436,8 +435,8 @@ void launch_max_pool2d_backward_kernel( using KernelClass = MaxPool2dBackwardDeterministicKernelFunctor; - BatchKernelConfig cfg = { - 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.template build(); auto kfn = KernelClass( gradInput, @@ -468,8 +467,8 @@ void launch_max_pool2d_backward_kernel( numBatch * numPlane * gradOutputSizeH * gradOutputSizeW; using KernelClass = MaxPool2dBackwardKernelFunctor; - BatchKernelConfig cfg = { - 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive); cfg.template build(); auto kfn = KernelClass( gradInput, diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 5b9e4fa6b..0c7338d5d 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -57,9 
+57,8 @@ void embedding_bag( vec_idx_t* max_idx_vec = reinterpret_cast(max_index); vec_len = vec_len / vec_size; - BatchKernelConfig cfg = { - bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive}; - cfg.template build(); + BatchKernelConfig cfg = BatchKernelConfig::make_config( + bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive); index_t fixing_bag_size = ignore_offsets ? index_size / bag_num : 0; auto kfn = KernelClass( diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h index d78ddfbf6..fcb429ef5 100644 --- a/src/ATen/native/xpu/sycl/Indexing.h +++ b/src/ATen/native/xpu/sycl/Indexing.h @@ -866,21 +866,17 @@ void launch_index_put_deterministic_kernel( return; } int64_t v_stride_before = numel * stride; - BatchKernelConfig cfg = { + // align with precision of CPU backend. + using accscalar_t = scalar_t; /* acc_type; */ + using KernelClass = IndexPutDeterministicKernelFunctor; + BatchKernelConfig cfg = BatchKernelConfig::make_config( /* num of indices */ numel, /* num of elements to put per indices */ outer_dim * stride, 1, numel, true, {BatchKernelConfig::Policy::pSegment, - BatchKernelConfig::Policy::pAggressiveSplit}}; - - // align with precision of CPU backend. - using accscalar_t = scalar_t; /* acc_type; */ - using KernelClass = IndexPutDeterministicKernelFunctor; - - cfg.template build(); - + BatchKernelConfig::Policy::pAggressiveSplit}); KernelClass kfn( sorted_indices, indices, diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.cpp b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp new file mode 100644 index 000000000..dd93f68c3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.cpp @@ -0,0 +1,1027 @@ +#include +#include +#include +#include +#include +#include +#include +#include "comm/Runtime.h" + +namespace at::native::xpu { + +template +struct ReduceAdd { + T operator()(const T a, const T b) const { + return a + b; + } +}; + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t ldr_pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t str_pi = id.chunk; + int64_t ldr_lid = + si + ldr_pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t ldr_off = at::xpu::detail::IndexToOffset::get( + ldr_lid, + iinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t str_lid = + si + str_pi * cfg_.stride_ + bi * id.chunk_num * cfg_.stride_; + int64_t str_off = at::xpu::detail::IndexToOffset::get( + str_lid, + oinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t value = 0; + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + value = (accscalar_t)iinfo_.data[ldr_off]; + if (need_squre_) + value *= value; + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (id.chunk_off == 0) { + oinfo_.data[str_off] = is_final_ ? 
sqrtf(value) : value; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(shared_memeory_size_, cgh); + } + WeightNormReduceKernelFunctor( + ScalarTypeInfo iinfo, + AccTypeInfo oinfo, + BatchKernelConfig cfg, + bool need_squre, + bool is_final, + int64_t shared_memeory_size) + : iinfo_(iinfo), + oinfo_(oinfo), + cfg_(cfg), + need_squre_(need_squre), + is_final_(is_final), + shared_memeory_size_(shared_memeory_size) {} + + private: + ScalarTypeInfo iinfo_; + AccTypeInfo oinfo_; + BatchKernelConfig cfg_; + bool need_squre_; + bool is_final_; + int64_t shared_memeory_size_; + sycl_local_acc_t shared_; +}; + +template +static inline void launch_weight_norm_reduce_kernel( + ScalarTypeInfo& iinfo, + AccTypeInfo& oinfo, + BatchKernelConfig& cfg, + bool need_squre, + bool is_final) { + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + + WeightNormReduceKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t> + kfn(iinfo, oinfo, cfg, need_squre, is_final, cfg.group_size().size()); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template +static inline void weight_norm_reduce( + ScalarTypeInfo& vinfo, + AccTypeInfo& ninfo, + int dim_after_collapse, + bool need_square) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormReduceKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + if (cfg.problem_ <= cfg.problem_wg_range_) { + launch_weight_norm_reduce_kernel(vinfo, ninfo, cfg, need_square, true); + return; + } + + Tensor carrier = at::empty( + {cfg.batch_, cfg.problem_glb_range_ / cfg.problem_wg_range_, cfg.stride_}, + map_options()); + auto cinfo = + at::xpu::detail::getTensorInfo( + carrier); + launch_weight_norm_reduce_kernel(vinfo, cinfo, cfg, need_square, false); + + weight_norm_reduce(cinfo, ninfo, 1, false); + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t> +struct SegmentWeightNormKernelFunctor { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t w_lid = si + pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t n_lid = id.glb_batch; + + int64_t v_off = at::xpu::detail::IndexToOffset::get( + w_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t w_off = at::xpu::detail::IndexToOffset::get( + w_lid, + winfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + if (id.glb_problem < 
cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + winfo_.data[w_off] = + (1.f / ninfo_.data[n_off]) * vinfo_.data[v_off] * ginfo_.data[g_off]; + } + } + SegmentWeightNormKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo winfo, + AccTypeInfo ninfo, + BatchKernelConfig cfg) + : vinfo_(vinfo), ginfo_(ginfo), winfo_(winfo), ninfo_(ninfo), cfg_(cfg) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo winfo_; + AccTypeInfo ninfo_; + BatchKernelConfig cfg_; +}; + +template +static inline void segment_weight_norm( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& winfo, + AccTypeInfo& ninfo, + int dim_after_collapse) { + // segment reduce for statistics + weight_norm_reduce(vinfo, ninfo, dim_after_collapse, true); + + // normalization + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + + using KernelClass = SegmentWeightNormKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + KernelClass kfn(vinfo, ginfo, winfo, ninfo, cfg); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t n_lid = id.glb_batch; + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk_off; + bi = si + bi * cfg_.problem_ * cfg_.stride_; + + accscalar_t value = 0; + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid = bi + pi_ * cfg_.stride_; + int64_t v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t v = (accscalar_t)vinfo_.data[v_off]; + value += v * v; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + int n_slid = (int)id.glb_batch % batch_wg_range_; + if (id.glb_batch < cfg_.problem_batch_ && id.chunk_off == 0) { + value = sqrtf(value); + ninfo_.data[n_off] = value; + shared_[n_slid] = value; + } + // Here using slm instead. If using ugm, need fence w/ + // order:acq_rel & scope:workgroup & space:global_mem. 
+ item.barrier(sycl_local_fence); + + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid = bi + pi_ * cfg_.stride_; + int64_t v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t w_off = at::xpu::detail::IndexToOffset::get( + v_lid, + winfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + winfo_.data[w_off] = + (1.f / shared_[n_slid]) * vinfo_.data[v_off] * ginfo_.data[g_off]; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(wg_size_, cgh); + } + + WeightNormKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo winfo, + AccTypeInfo ninfo, + BatchKernelConfig cfg, + int wg_size, + int batch_wg_range) + : vinfo_(vinfo), + ginfo_(ginfo), + winfo_(winfo), + ninfo_(ninfo), + cfg_(cfg), + wg_size_(wg_size), + batch_wg_range_(batch_wg_range) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo winfo_; + AccTypeInfo ninfo_; + BatchKernelConfig cfg_; + int wg_size_; + int batch_wg_range_; + sycl_local_acc_t shared_; +}; + +template +static inline void weight_norm( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& winfo, + AccTypeInfo& ninfo, + int dim_after_collapse) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + + using KernelClass = WeightNormKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + BatchKernelConfig::Policy::pLoop); + + int wg_size = cfg.group_size().size(); + int batch_wg_range = wg_size / cfg.problem_wg_range_; + KernelClass kfn(vinfo, ginfo, winfo, ninfo, cfg, wg_size, batch_wg_range); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + + return; +} + +std::tuple weight_norm_kernel( + const Tensor& v, + const Tensor& g, + int64_t dim) { + TORCH_INTERNAL_ASSERT( + dim == 0 || dim == v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + + at::ScalarType scalar_acc_t = (g.scalar_type() == at::ScalarType::Half || + g.scalar_type() == at::ScalarType::BFloat16) + ? 
at::ScalarType::Float + : g.scalar_type(); + auto norms = at::empty( + g.sizes(), g.options().dtype(scalar_acc_t), g.suggest_memory_format()); + auto w = at::empty(v.sizes(), v.options(), v.suggest_memory_format()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + v.scalar_type(), + "aten::weight_norm", + [&] { + auto vinfo = at::xpu::detail::getTensorInfo(v); + int dim_after_collapse = vinfo.collapseDims(dim); + auto ginfo = at::xpu::detail::getTensorInfo(g); + ginfo.collapseDims(); + + auto winfo = at::xpu::detail::getTensorInfo(w); + winfo.collapseDims(dim); + using accscalar_t = acc_type; + auto ninfo = + at::xpu::detail::getTensorInfo(norms); + ninfo.collapseDims(); + dim_after_collapse = 1 - dim_after_collapse; // remain dim + + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = + vinfo.strides[dim_after_collapse] == 1 ? true : false; + if (BatchKernelConfig::Policy::pSegment == + BatchKernelConfig::suggest_policy( + batch, problem, stride, problem_along_x)) { + segment_weight_norm(vinfo, ginfo, winfo, ninfo, dim_after_collapse); + } else { + weight_norm(vinfo, ginfo, winfo, ninfo, dim_after_collapse); + } + }); + + return {w, norms}; +} + +template < + bool is_first, + class ScalarType1Info, + class ScalarType2Info, + class AccTypeInfo, + typename scalar1_t, + typename scalar2_t, + typename accscalar_t, + typename vec_t> +struct WeightNormBackwardReduceKernelFunctor + : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t i_pi = id.chunk * id.chunk_size + id.chunk_off; + int64_t o_pi = id.chunk; + + int64_t i_lid = + si + i_pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t i1_off = at::xpu::detail::IndexToOffset::get( + i_lid, + i1info_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t i2_off; + if (is_first) { + i2_off = at::xpu::detail::IndexToOffset::get( + i_lid, + i2info_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + } + + int64_t o_lid = si + o_pi * cfg_.stride_ + bi * id.chunk_num * cfg_.stride_; + int64_t o_off = at::xpu::detail::IndexToOffset::get( + o_lid, + oinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t value = 0; + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (is_first) { + auto value1 = (accscalar_t)i1info_.data[i1_off]; + auto value2 = (accscalar_t)i2info_.data[i2_off]; + value = value1 * value2; + } else { + value = (accscalar_t)i1info_.data[i1_off]; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + if (id.chunk_off == 0) { + oinfo_.data[o_off] = value; + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(local_size_, cgh); + } + WeightNormBackwardReduceKernelFunctor( + ScalarType1Info i1info, + ScalarType2Info i2info, + AccTypeInfo oinfo, + BatchKernelConfig cfg, + int64_t local_size) + : i1info_(i1info), + i2info_(i2info), + oinfo_(oinfo), + cfg_(cfg), + local_size_(local_size) {} + + private: + ScalarType1Info i1info_; 
+ ScalarType2Info i2info_; + AccTypeInfo oinfo_; + BatchKernelConfig cfg_; + int64_t local_size_; + sycl_local_acc_t shared_; +}; + +template < + bool is_first, + class ScalarType1Info, + class ScalarType2Info, + class AccTypeInfo> +static inline void launch_weight_norm_backward_reduce_kernel( + ScalarType1Info& i1info, + ScalarType2Info& i2info, + AccTypeInfo& oinfo, + BatchKernelConfig& cfg) { + using scalar1_t = typename ScalarType1Info::scalar_t; + using scalar2_t = typename ScalarType2Info::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + WeightNormBackwardReduceKernelFunctor< + is_first, + ScalarType1Info, + ScalarType2Info, + AccTypeInfo, + scalar1_t, + scalar2_t, + accscalar_t, + vec_t> + kfn(i1info, i2info, oinfo, cfg, cfg.group_size().size()); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); +} + +template +static inline void weight_norm_backward_reduce( + ScalarType1Info& vinfo, + ScalarType2Info& gwinfo, + AccTypeInfo& rinfo, + int dim_after_collapse, + bool is_first) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + + using scalar1_t = typename ScalarType1Info::scalar_t; + using scalar2_t = typename ScalarType2Info::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormBackwardReduceKernelFunctor< + true, + ScalarType1Info, + ScalarType2Info, + AccTypeInfo, + scalar1_t, + scalar2_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + if (cfg.problem_ <= cfg.problem_wg_range_) { + if (is_first) { + launch_weight_norm_backward_reduce_kernel( + vinfo, gwinfo, rinfo, cfg); + } else { + launch_weight_norm_backward_reduce_kernel( + vinfo, gwinfo, rinfo, cfg); + } + return; + } + + Tensor carrier = at::empty( + {cfg.batch_, cfg.problem_glb_range_ / cfg.problem_wg_range_, cfg.stride_}, + map_options()); + auto cinfo = + at::xpu::detail::getTensorInfo( + carrier); + if (is_first) { + launch_weight_norm_backward_reduce_kernel(vinfo, gwinfo, cinfo, cfg); + } else { + launch_weight_norm_backward_reduce_kernel(vinfo, gwinfo, cinfo, cfg); + } + + weight_norm_backward_reduce(cinfo, gwinfo, rinfo, 1, false); + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t> +struct SegmentWeightNormBackwardKernelFunctor { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk * id.chunk_size + id.chunk_off; + + int64_t gv_lid = si + pi * cfg_.stride_ + bi * cfg_.problem_ * cfg_.stride_; + int64_t gg_lid = id.glb_batch; + + int64_t v_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gw_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gv_off = at::xpu::detail::IndexToOffset::get( + gv_lid, + gvinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t g_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + 
NON_STRICT_CONTIGUOUS); + + int64_t n_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t r_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + rinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + int64_t gg_off = at::xpu::detail::IndexToOffset::get( + gg_lid, + gginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + if (id.glb_problem < cfg_.problem_ && id.glb_batch < cfg_.problem_batch_) { + accscalar_t g = ginfo_.data[g_off]; + accscalar_t gw = gwinfo_.data[gw_off]; + accscalar_t v = vinfo_.data[v_off]; + accscalar_t n = 1.f / ninfo_.data[n_off]; + accscalar_t r = rinfo_.data[r_off]; + accscalar_t gg = r * n; + accscalar_t n3 = n * n * n; + accscalar_t gv = g * (n * gw - n3 * v * r); + + gvinfo_.data[gv_off] = static_cast(gv); + if (id.chunk == 0 && id.chunk_off == 0) + gginfo_.data[gg_off] = static_cast(gg); + } + } + SegmentWeightNormBackwardKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo gwinfo, + AccTypeInfo ninfo, + ScalarTypeInfo gvinfo, + ScalarTypeInfo gginfo, + AccTypeInfo rinfo, + BatchKernelConfig cfg) + : vinfo_(vinfo), + ginfo_(ginfo), + gwinfo_(gwinfo), + ninfo_(ninfo), + gvinfo_(gvinfo), + gginfo_(gginfo), + rinfo_(rinfo), + cfg_(cfg) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo gwinfo_; + AccTypeInfo ninfo_; + ScalarTypeInfo gvinfo_; + ScalarTypeInfo gginfo_; + AccTypeInfo rinfo_; + BatchKernelConfig cfg_; +}; + +template +static inline void segment_weight_norm_backward( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& gwinfo, + AccTypeInfo& ninfo, + ScalarTypeInfo& gvinfo, + ScalarTypeInfo& gginfo, + AccTypeInfo& rinfo, + int dim_after_collapse) { + // segment reduce + weight_norm_backward_reduce(vinfo, gwinfo, rinfo, dim_after_collapse, true); + + // compute gradient + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? 
true : false; + + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using KernelClass = SegmentWeightNormBackwardKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, problem, stride, batch * stride, problem_along_x); + + KernelClass kfn(vinfo, ginfo, gwinfo, ninfo, gvinfo, gginfo, rinfo, cfg); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + + return; +} + +template < + class ScalarTypeInfo, + class AccTypeInfo, + typename scalar_t, + typename accscalar_t, + typename vec_t> +struct WeightNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<2> item) const { + auto id = cfg_.get_item_desc(item); + int64_t n_lid = id.glb_batch; + int64_t g_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t gg_off = at::xpu::detail::IndexToOffset::get( + n_lid, + gginfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t n_off = at::xpu::detail::IndexToOffset::get( + n_lid, + ninfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + int64_t si = id.glb_batch % cfg_.stride_; + int64_t bi = id.glb_batch / cfg_.stride_; + int64_t pi = id.chunk_off; + bi = si + bi * cfg_.problem_ * cfg_.stride_; + + accscalar_t value = 0; + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid, v_off, gw_off; + v_lid = bi + pi_ * cfg_.stride_; + + v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gw_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t v = (accscalar_t)vinfo_.data[v_off]; + accscalar_t gw = (accscalar_t)gwinfo_.data[gw_off]; + value += v * gw; + } + } + + if (cfg_.problem_along_x_) { + value = group_x_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } else { + value = group_y_reduce( + item, shared_, vec_t(value), ReduceAdd())[0]; + } + + int n_slid = (int)id.glb_batch % batch_wg_range_; + if (id.glb_batch < cfg_.problem_batch_ && id.chunk_off == 0) { + shared_[n_slid] = value; + } + item.barrier(sycl_local_fence); + + if (id.glb_batch < cfg_.problem_batch_) { + for (int pi_ = pi; pi_ < cfg_.problem_; pi_ += cfg_.problem_wg_range_) { + int64_t v_lid, v_off, gw_off, gv_off; + v_lid = bi + pi_ * cfg_.stride_; + + v_off = at::xpu::detail::IndexToOffset::get( + v_lid, + vinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gw_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gwinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + gv_off = at::xpu::detail::IndexToOffset::get( + v_lid, + gvinfo_, + at::xpu::detail::IndexToOffset:: + NON_STRICT_CONTIGUOUS); + + accscalar_t g = ginfo_.data[g_off]; + accscalar_t gw = gwinfo_.data[gw_off]; + accscalar_t v = vinfo_.data[v_off]; + accscalar_t n = 1.f / ninfo_.data[n_off]; + accscalar_t r = shared_[n_slid]; + accscalar_t gg = r * n; + accscalar_t n3 = n * n * n; + accscalar_t gv = g * (n * gw - n3 * v * r); + + gvinfo_.data[gv_off] = static_cast(gv); + if (id.chunk_off == 0) + gginfo_.data[gg_off] = static_cast(gg); + } + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + shared_ = sycl_local_acc_t(wg_size_, cgh); + } + + 
WeightNormBackwardKernelFunctor( + ScalarTypeInfo vinfo, + ScalarTypeInfo ginfo, + ScalarTypeInfo gwinfo, + AccTypeInfo ninfo, + ScalarTypeInfo gvinfo, + ScalarTypeInfo gginfo, + BatchKernelConfig cfg, + int wg_size, + int batch_wg_range) + : vinfo_(vinfo), + ginfo_(ginfo), + gwinfo_(gwinfo), + ninfo_(ninfo), + gvinfo_(gvinfo), + gginfo_(gginfo), + cfg_(cfg), + wg_size_(wg_size), + batch_wg_range_(batch_wg_range) {} + + private: + ScalarTypeInfo vinfo_; + ScalarTypeInfo ginfo_; + ScalarTypeInfo gwinfo_; + AccTypeInfo ninfo_; + ScalarTypeInfo gvinfo_; + ScalarTypeInfo gginfo_; + BatchKernelConfig cfg_; + int wg_size_; + int batch_wg_range_; + sycl_local_acc_t shared_; +}; + +template +static inline void weight_norm_backward( + ScalarTypeInfo& vinfo, + ScalarTypeInfo& ginfo, + ScalarTypeInfo& gwinfo, + AccTypeInfo& ninfo, + ScalarTypeInfo& gvinfo, + ScalarTypeInfo& gginfo, + int dim_after_collapse) { + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = vinfo.strides[dim_after_collapse] == 1 ? true : false; + + using scalar_t = typename ScalarTypeInfo::scalar_t; + using accscalar_t = typename AccTypeInfo::scalar_t; + using vec_t = at::detail::Array; + using KernelClass = WeightNormBackwardKernelFunctor< + ScalarTypeInfo, + AccTypeInfo, + scalar_t, + accscalar_t, + vec_t>; + BatchKernelConfig cfg = BatchKernelConfig::make_config( + batch, + problem, + stride, + batch * stride, + problem_along_x, + BatchKernelConfig::Policy::pLoop); + int wg_size = cfg.group_size().size(); + int batch_wg_range = wg_size / cfg.problem_wg_range_; + KernelClass kfn( + vinfo, + ginfo, + gwinfo, + ninfo, + gvinfo, + gginfo, + cfg, + wg_size, + batch_wg_range); + sycl_kernel_submit( + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); + return; +} + +std::tuple weight_norm_backward_kernel( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + int64_t dim) { + auto grad_v = at::empty_like(saved_v, c10::get_contiguous_memory_format()); + auto grad_g = at::empty_like(saved_g, c10::get_contiguous_memory_format()); + + at::ScalarType scalar_acc_t = + (saved_g.scalar_type() == at::ScalarType::Half || + saved_g.scalar_type() == at::ScalarType::BFloat16) + ? at::ScalarType::Float + : saved_g.scalar_type(); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + saved_v.scalar_type(), + "aten::weight_norm_backward", + [&] { + auto vinfo = at::xpu::detail::getTensorInfo(saved_v); + int dim_after_collapse = vinfo.collapseDims(dim); + + auto ginfo = at::xpu::detail::getTensorInfo(saved_g); + ginfo.collapseDims(); + + auto gwinfo = at::xpu::detail::getTensorInfo(grad_w); + gwinfo.collapseDims(dim); + using accscalar_t = acc_type; + auto ninfo = + at::xpu::detail::getTensorInfo(saved_norms); + ninfo.collapseDims(); + + auto gvinfo = at::xpu::detail::getTensorInfo(grad_v); + gvinfo.collapseDims(dim); + + auto gginfo = at::xpu::detail::getTensorInfo(grad_g); + gginfo.collapseDims(); + + dim_after_collapse = 1 - dim_after_collapse; // remain dim + + int64_t batch = vinfo.outerSize(dim_after_collapse); + int64_t problem = vinfo.sizes[dim_after_collapse]; + int64_t stride = vinfo.innerSize(dim_after_collapse); + bool problem_along_x = + vinfo.strides[dim_after_collapse] == 1 ? 
true : false; + if (BatchKernelConfig::Policy::pSegment == + BatchKernelConfig::suggest_policy( + batch, problem, stride, problem_along_x)) { + auto reduce = at::empty( + saved_g.sizes(), + saved_g.options().dtype(scalar_acc_t), + c10::get_contiguous_memory_format()); + auto rinfo = + at::xpu::detail::getTensorInfo(reduce); + rinfo.collapseDims(); + + segment_weight_norm_backward( + vinfo, + ginfo, + gwinfo, + ninfo, + gvinfo, + gginfo, + rinfo, + dim_after_collapse); + } else { + weight_norm_backward( + vinfo, ginfo, gwinfo, ninfo, gvinfo, gginfo, dim_after_collapse); + } + }); + + return {grad_v, grad_g}; +} + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/WeightNormKernels.h b/src/ATen/native/xpu/sycl/WeightNormKernels.h new file mode 100644 index 000000000..5b0d7afd2 --- /dev/null +++ b/src/ATen/native/xpu/sycl/WeightNormKernels.h @@ -0,0 +1,16 @@ +#pragma once +#include + +namespace at::native::xpu { +std::tuple weight_norm_kernel( + const Tensor& v, + const Tensor& g, + int64_t dim); + +std::tuple weight_norm_backward_kernel( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norms, + int64_t dim); +} // namespace at::native::xpu \ No newline at end of file diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 5c354e0b0..2ca8e8fc1 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -694,4 +694,6 @@ supported: - renorm.out - renorm_ - nan_to_num.out + - _weight_norm_interface + - _weight_norm_interface_backward - range.out
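As a closing illustration, a small sketch of how the fused weight-norm path added in the last patch can be exercised from Python. This is a hedged smoke test, not part of the patch: it presumes an XPU-enabled build that registers these kernels and that the composite `_weight_norm` op selects the fused interface for dim 0.

```python
import torch
import torch.nn as nn

# weight_norm reparameterizes the weight as w = g * v / ||v||, with the norm
# taken per slice of dimension `dim`; the fused kernels above cover dim == 0
# and the last dim.
lin = nn.utils.weight_norm(nn.Linear(16, 32), name="weight", dim=0).to("xpu")
x = torch.randn(4, 16, device="xpu")
loss = lin(x).sum()
loss.backward()  # may route through _weight_norm_interface_backward
print(lin.weight_g.grad.shape, lin.weight_v.grad.shape)
```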