From 201a29bd8fefe8864784fe360594956db8cee1c3 Mon Sep 17 00:00:00 2001
From: majing
Date: Wed, 24 Jul 2024 08:18:22 +0800
Subject: [PATCH 1/8] Update pin_memory implementation to follow stock PyTorch
 change (#639)

PyTorch implemented device-agnostic aten::is_pinned and aten::_pin_memory in
the latest main:
https://github.com/pytorch/pytorch/commit/8963623494d096e3e5d5038aaedeb0a4fa2b3b57

Signed-off-by: majing
---
 src/ATen/native/xpu/PinnedMemoryAllocator.cpp | 39 -------------------
 test/xpu/test_torch_xpu.py                    | 30 ++++----------
 yaml/xpu_functions.yaml                       |  2 -
 3 files changed, 8 insertions(+), 63 deletions(-)
 delete mode 100644 src/ATen/native/xpu/PinnedMemoryAllocator.cpp

diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
deleted file mode 100644
index 8d04d93e1..000000000
--- a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include
-#include
-#include
-#include
-
-namespace at {
-
-// Note: The user must call is_pinned(device='xpu') to explicitly call here.
-bool XPUNativeFunctions::is_pinned(
-    const Tensor& self,
-    c10::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-      !device.has_value() || device->type() == c10::DeviceType::XPU);
-
-  return at::detail::getXPUHooks().isPinnedPtr(self.storage().data());
-}
-
-// Note: The user must call tensor.pin_memory(device='xpu') to explicitly call
-// here.
-Tensor XPUNativeFunctions::_pin_memory(
-    const Tensor& self,
-    c10::optional<Device> device) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-      !device.has_value() || device->type() == c10::DeviceType::XPU);
-
-  auto* allocator = at::xpu::getPinnedMemoryAllocator();
-  auto storage = c10::Storage(
-      c10::Storage::use_byte_size_t(),
-      at::detail::computeStorageNbytes(
-          self.sizes(), self.strides(), self.dtype().itemsize()),
-      allocator,
-      /*resizable=*/false);
-  auto tensor = at::cpu::empty({0}, self.options())
-                    .set_(storage, 0, self.sizes(), self.strides());
-  tensor.copy_(self);
-  return tensor;
-}
-
-} // namespace at

diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py
index afb4b28a8..b82a8ec67 100644
--- a/test/xpu/test_torch_xpu.py
+++ b/test/xpu/test_torch_xpu.py
@@ -8572,31 +8572,17 @@ def test_new(self) -> None:
         # TypeError would be better
         self.assertRaises(RuntimeError, lambda: x.new(z.storage()))
 
-    @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property")
     def test_pin_memory(self):
         x = torch.randn(3, 5)
         self.assertFalse(x.is_pinned())
-        if not torch.cuda.is_available() or not torch.xpu.is_available():
-            self.assertRaises(RuntimeError, lambda: x.pin_memory())
-        else:
-            if torch.xpu.is_available():
-                device = 'xpu'
-                self.assertFalse(x.is_pinned(device))
-                pinned = x.pin_memory(device)
-                self.assertTrue(pinned.is_pinned(device))
-                self.assertEqual(pinned, x)
-                self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
-                # test that pin_memory on already pinned tensor has no effect
-                self.assertIs(pinned, pinned.pin_memory(device))
-                self.assertEqual(pinned.data_ptr(), pinned.pin_memory(device).data_ptr())
-            else:
-                pinned = x.pin_memory()
-                self.assertTrue(pinned.is_pinned())
-                self.assertEqual(pinned, x)
-                self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
-                # test that pin_memory on already pinned tensor has no effect
-                self.assertIs(pinned, pinned.pin_memory())
-                self.assertEqual(pinned.data_ptr(), pinned.pin_memory().data_ptr())
+        if torch.cuda.is_available() or torch.xpu.is_available():
+            pinned = x.pin_memory()
+            self.assertTrue(pinned.is_pinned())
+            self.assertEqual(pinned, x)
+            self.assertNotEqual(pinned.data_ptr(), x.data_ptr())
+            # test that pin_memory on already pinned tensor has no effect
+            self.assertIs(pinned, pinned.pin_memory())
+            self.assertEqual(pinned.data_ptr(), pinned.pin_memory().data_ptr())

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 54c5c31ff..cd6f80b84 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -539,8 +539,6 @@ supported:
   - sgn.out
   - sgn_
   - _cdist_forward
-  - _pin_memory
-  - is_pinned
   - is_set_to
   - bucketize.Tensor
   - bucketize.Tensor_out

From fbd095a57bb146c1360e2dfe5c8cd7ef615567ae Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Tue, 23 Jul 2024 20:04:00 -0700
Subject: [PATCH 2/8] Drop CPU fallback for aten::norm.out and
 aten::nextafter.out (#641)

Fixes: #640
Fixes: 6eca394 ("Add aten::nextafter and its variants")

CC: @fengyuan14 @yucai-intel @xytintel

Signed-off-by: Dmitry Rogozhkin
---
 src/ATen/native/xpu/XPUFallback.template | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index d5ae19d5a..52b417415 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -239,8 +239,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "nanmedian",
       "nanmedian.dim_values",
       "nansum",
-      "norm.out",
-      "nextafter.out",
       "ormqr",
       "_pdist_backward",
       "_pdist_forward",

From 07ebadbd582305ca4dac2759358e8be651979df4 Mon Sep 17 00:00:00 2001
From: Yutao Xu
Date: Wed, 24 Jul 2024 11:05:17 +0800
Subject: [PATCH 3/8] Update the comments of the radix sort kernel (#633)

Update the comments of the radix sort kernel.

Co-authored-by: Feng Yuan
---
 src/ATen/native/xpu/sycl/SortingKernels.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/SortingKernels.h b/src/ATen/native/xpu/sycl/SortingKernels.h
index fb29a9f53..b39595143 100644
--- a/src/ATen/native/xpu/sycl/SortingKernels.h
+++ b/src/ATen/native/xpu/sycl/SortingKernels.h
@@ -539,9 +539,7 @@ void segmented_sort_pairs_(
     int num_elements) {
   constexpr int scaling_coef = sizeof(key_t) * sizeof(value_t) >= 64
       ? 2
-      : 1; // Attempt to reduce register pressure. The result will be incorrect
-           // when using too many local variables (registers).
-           // https://github.com/intel/torch-xpu-ops/issues/626
+      : 1; // Attempt to reduce register pressure for performance.
   if (num_elements > 4096 / scaling_coef) {
     // Considering register pressure, we use a problem size of 4096 to delineate
     // the boundary between single tile sort and group sort.
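A usage-level note before moving to patch 4: the device-agnostic path that
patch 1 switches to behaves as follows from the Python side. This is a minimal
sketch, assuming a PyTorch build that exposes `torch.xpu` and has torch-xpu-ops
loaded:

```python
import torch

x = torch.randn(3, 5)
if torch.cuda.is_available() or torch.xpu.is_available():
    # No device= argument is needed anymore: aten::_pin_memory now dispatches
    # to whichever accelerator backend is available.
    pinned = x.pin_memory()
    assert pinned.is_pinned()                 # device-agnostic query
    assert pinned.data_ptr() != x.data_ptr()  # pinning copies into page-locked memory
    assert pinned.pin_memory() is pinned      # re-pinning a pinned tensor is a no-op
```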
From f716a582852f562e2ad26a0381885b0509168fe9 Mon Sep 17 00:00:00 2001 From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:54:54 +0800 Subject: [PATCH 4/8] Add aten::repeat_interleave (#564) ![image](https://github.com/intel/torch-xpu-ops/assets/105194461/1fe5b8bc-c82a-41eb-b673-7c5cb6f5d3cf) --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/ReduceOps.cpp | 1 - src/ATen/native/xpu/Repeat.cpp | 10 +++ src/ATen/native/xpu/XPUFallback.template | 1 - src/ATen/native/xpu/sycl/RepeatKernel.cpp | 79 +++++++++++++++++++++++ src/ATen/native/xpu/sycl/RepeatKernel.h | 9 +++ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 7 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/Repeat.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.cpp create mode 100644 src/ATen/native/xpu/sycl/RepeatKernel.h diff --git a/src/ATen/native/xpu/ReduceOps.cpp b/src/ATen/native/xpu/ReduceOps.cpp index 99069eefb..826b1e27b 100644 --- a/src/ATen/native/xpu/ReduceOps.cpp +++ b/src/ATen/native/xpu/ReduceOps.cpp @@ -47,7 +47,6 @@ static void cum_ops_meta( maybe_wrap_dim(dim, self.dim()); ScalarType out_dtype; - if (result.defined()) { out_dtype = dtype.value_or(result.scalar_type()); at::xpu::resize_out( diff --git a/src/ATen/native/xpu/Repeat.cpp b/src/ATen/native/xpu/Repeat.cpp new file mode 100644 index 000000000..38e5ae8da --- /dev/null +++ b/src/ATen/native/xpu/Repeat.cpp @@ -0,0 +1,10 @@ +#include +#include +#include +namespace at { +Tensor XPUNativeFunctions::repeat_interleave( + const Tensor& repeats, + c10::optional output_size) { + return at::native::xpu::repeat_interleave_kernel(repeats, output_size); +} +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 52b417415..356a45f70 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -248,7 +248,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "prod", "prod.int_out", "put_", - "repeat_interleave.Tensor", "replication_pad1d_backward.grad_input", "replication_pad1d.out", "replication_pad2d_backward", diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.cpp b/src/ATen/native/xpu/sycl/RepeatKernel.cpp new file mode 100644 index 000000000..c3e27ab62 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +namespace at::native::xpu { +template +struct RepeatInterleaveKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + auto rep_ptr = rep_data_; + auto cum_ptr = cum_data_; + auto res_ptr = res_data_; + + for (int64_t i = item.get_global_id(0); i < size_; + i += item.get_global_range()[0]) { + int64_t end = cum_ptr[i]; + int64_t repeat = rep_ptr[i]; + int64_t start = end - repeat; + for (int64_t j = start; j < end; j++) { + res_ptr[j] = i; + } + } + } + RepeatInterleaveKernelFunctor( + const index_t* rep_data, + const int64_t* cum_data, + index_t* res_data, + int64_t size, + int64_t result_size) + : rep_data_(rep_data), + cum_data_(cum_data), + res_data_(res_data), + size_(size), + result_size_(result_size) {} + + private: + const index_t* rep_data_; + const int64_t* cum_data_; + index_t* res_data_; + int64_t size_; + int64_t result_size_; +}; + +template +static void compute_xpu( + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, + index_t* result_ptr, + int64_t size, + int64_t result_size) { + if (size == 0) + return; + + auto kfn = RepeatInterleaveKernelFunctor( + 
repeat_ptr, + cumsum_ptr, + result_ptr, + size, + result_size); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t local_range = size < wg_size ? size : wg_size; + int64_t global_range = ((size + local_range - 1) / local_range) * local_range; + + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); +} + +Tensor repeat_interleave_kernel( + const Tensor& repeat, + c10::optional output_size) { + Tensor output; + + AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_xpu", [&] { + output = repeat_interleave_common>( + repeat, output_size); + }); + return output; +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/RepeatKernel.h b/src/ATen/native/xpu/sycl/RepeatKernel.h new file mode 100644 index 000000000..3df41a956 --- /dev/null +++ b/src/ATen/native/xpu/sycl/RepeatKernel.h @@ -0,0 +1,9 @@ +#pragma once +#include +namespace at::native::xpu { + +Tensor repeat_interleave_kernel( + const Tensor& repeats, + c10::optional output_size); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 3e3b5d4d1..96e43058a 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -209,6 +209,7 @@ "aminmax", "argmin", "conj_physical", + "repeat_interleave", "fmax", "fmin", "floor", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index cd6f80b84..48ddc4c3f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -660,6 +660,7 @@ supported: - ceil - ceil_ - ceil.out + - repeat_interleave.Tensor - norm.ScalarOpt_dim_dtype - norm.dtype_out - norm.ScalarOpt_dim From d25d3d09a2a01867bf613f9cc535c596832eb2e8 Mon Sep 17 00:00:00 2001 From: yuchengliu1 Date: Wed, 24 Jul 2024 14:42:30 +0800 Subject: [PATCH 5/8] pow_Tensor_Scalar: fixing error result when scalar exp is a bool. (#638) Fixing TestBinaryUfuncsXPU.test_pow_xpu_int --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Pow.cpp | 8 +++++++- test/xpu/run_test_with_skip.py | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/Pow.cpp b/src/ATen/native/xpu/Pow.cpp index df335bfa1..97dc5a0c2 100644 --- a/src/ATen/native/xpu/Pow.cpp +++ b/src/ATen/native/xpu/Pow.cpp @@ -58,7 +58,13 @@ Tensor& XPUNativeFunctions::pow_out( Tensor XPUNativeFunctions::pow(const Tensor& self, const Scalar& exponent) { Tensor out; auto iter = pow_tensor_scalar_meta(self, exponent, out); - native::xpu::pow_tensor_scalar_kernel(iter, exponent); + if (exponent.equal(0.0) || exponent.equal(false)) { + iter.output().fill_(1); + } else if (exponent.equal(1.0) || exponent.equal(true)) { + iter.output().copy_(self); + } else { + native::xpu::pow_tensor_scalar_kernel(iter, exponent); + } return iter.output(); } diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 0e00da513..3ffc3feb6 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -797,12 +797,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): skip_list = ( "test_fmod_remainder_by_zero_integral_xpu_int64", # zero division is an undefined behavior: different handles on different backends "test_div_rounding_numpy_xpu_float16", # Calculation error. XPU implementation uses opmath type. - # RuntimeError: false INTERNAL ASSERT FAILED at "torch-xpu-ops/src/ATen/native/xpu/sycl/PowKernels.cpp":233, please report a bug to PyTorch. 
invalid combination of type in Pow function, common dtype: Short, exp is integral? 0
+    # fail in complex_exponents=[-1.0 - 1.5j, 3.3j]
+    # Mismatched elements: 33 / 100 (33.0%)
+    # Greatest absolute difference: 0.00038337233127094805 at index (4,) (up to 1e-05 allowed)
+    # Greatest relative difference: 1.9085073290625587e-06 at index (6,) (up to 1.3e-06 allowed)
     "test_pow_xpu_int16",
     "test_pow_xpu_int32",
     "test_pow_xpu_int64",
-    "test_pow_xpu_int8",
-    "test_pow_xpu_uint8",
     # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available.
     "_jiterator_",
     # Unexpected success

From e1e195bd4d6f9541fc05c54579242f07f30a63b3 Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <106960996+guangyey@users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:55:52 +0800
Subject: [PATCH 6/8] Support XPU ABI=0 build (#531)

# Motivation
Support the XPU ABI-neutral build. Starting from compiler 2025.0, `libsycl.so`
is the ABI-neutral library; before that, `libsycl-preview.so` was. Refer to
https://github.com/pytorch/pytorch/pull/130110.
---
 cmake/BuildFlags.cmake                  |  6 +-
 cmake/Modules/FindSYCL.cmake            |  5 +-
 cmake/Modules/FindSYCLToolkit.cmake     | 73 ++++++-------------
 cmake/SYCL.cmake                        | 18 ------
 src/ATen/CMakeLists.txt                 |  2 +-
 src/ATen/native/xpu/sycl/AmpKernels.cpp |  4 +-
 src/CMakeLists.txt                      |  2 +-
 test/sycl/CMakeLists.txt                |  2 +-
 8 files changed, 28 insertions(+), 84 deletions(-)

diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index d0eb3e299..2f05d1ab0 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -59,10 +59,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value)
     set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz)
+    # Equivalent to build option -fpreview-breaking-changes for SYCL compiler.
+    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
+    set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
   endif()
-  # TODO: Align with PyTorch and switch to ABI=0 eventually, after
-  # resolving incompatible implementation in SYCL runtime.
- set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=1) set(SYCL_FLAGS ${SYCL_FLAGS} ${SYCL_KERNEL_OPTIONS}) set(TORCH_XPU_OPS_FLAGS ${SYCL_HOST_FLAGS}) diff --git a/cmake/Modules/FindSYCL.cmake b/cmake/Modules/FindSYCL.cmake index b8fc8ba22..4e266a26f 100644 --- a/cmake/Modules/FindSYCL.cmake +++ b/cmake/Modules/FindSYCL.cmake @@ -407,7 +407,6 @@ macro(SYCL_LINK_DEVICE_OBJECTS output_file sycl_target) OUTPUT ${output_file} DEPENDS ${object_files} COMMAND ${SYCL_EXECUTABLE} - -fsycl ${SYCL_device_link_flags} -fsycl-link ${object_files} -Xs "\"${SYCL_OFFLINE_COMPILER_FLAGS}\"" @@ -471,7 +470,7 @@ macro(SYCL_ADD_LIBRARY sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES @@ -530,7 +529,7 @@ macro(SYCL_ADD_EXECUTABLE sycl_target) target_link_libraries( ${sycl_target} ${SYCL_LINK_LIBRARIES_KEYWORD} - ${SYCL_LIBRARIES}) + ${SYCL_LIBRARY}) set_target_properties(${sycl_target} PROPERTIES diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake index e478cb4c5..46e34c7f8 100644 --- a/cmake/Modules/FindSYCLToolkit.cmake +++ b/cmake/Modules/FindSYCLToolkit.cmake @@ -25,16 +25,20 @@ This will define the following variables: #]=======================================================================] -set(SYCLTOOLKIT_FOUND False) -include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +include(${TORCH_ROOT}/cmake/Modules/FindSYCLToolkit.cmake) -set(SYCL_ROOT "") -if(DEFINED ENV{SYCL_ROOT}) - set(SYCL_ROOT $ENV{SYCL_ROOT}) -elseif(DEFINED ENV{CMPLR_ROOT}) - set(SYCL_ROOT $ENV{CMPLR_ROOT}) +if(NOT SYCL_FOUND) + set(SYCLTOOLKIT_FOUND FALSE) + return() endif() +if(SYCLTOOLKIT_FOUND) + return() +endif() +set(SYCLTOOLKIT_FOUND TRUE) + +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) + if(WIN32) set(SYCL_EXECUTABLE_NAME icx) else() @@ -71,43 +75,6 @@ if(nocmplr) set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() -find_file( - SYCL_INCLUDE_DIR - NAMES include - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_file( - SYCL_INCLUDE_SYCL_DIR - NAMES sycl - HINTS ${SYCL_ROOT}/include - NO_DEFAULT_PATH - ) - -list(APPEND SYCL_INCLUDE_DIR ${SYCL_INCLUDE_SYCL_DIR}) - -find_file( - SYCL_LIBRARY_DIR - NAMES lib lib64 - HINTS ${SYCL_ROOT} - NO_DEFAULT_PATH - ) - -find_library( - SYCL_LIBRARY - NAMES sycl - HINTS ${SYCL_LIBRARY_DIR} - NO_DEFAULT_PATH -) - -if((NOT SYCL_INCLUDE_DIR) OR (NOT SYCL_LIBRARY_DIR) OR (NOT SYCL_LIBRARY)) - set(SYCLTOOLKIT_FOUND False) - set(SYCL_REASON_FAILURE "SYCL sdk is incomplete!!") - set(SYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") - return() -endif() - # Function to write a test case to verify SYCL features. function(SYCL_CMPLR_TEST_WRITE src) @@ -202,6 +169,13 @@ set(SYCL_FLAGS "") set(SYCL_LINK_FLAGS "") list(APPEND SYCL_FLAGS "-fsycl") list(APPEND SYCL_LINK_FLAGS "-fsycl") +if(LINUX) + string(REGEX MATCH "libsycl-preview.so" is_abi_neutral ${SYCL_LIBRARY}) + if(is_abi_neutral) + list(APPEND SYCL_FLAGS "-fpreview-breaking-changes") + list(APPEND SYCL_LINK_FLAGS "-fpreview-breaking-changes") + endif() +endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -249,14 +223,3 @@ endif() message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") - -# Avoid module variables conflict due to calling find_package recursively -# e.g. 
find_package -> add_subdirectory(entering in a sub-project) -> find_package -# find_package_handle_standard_args( -# SYCLToolkit -# FOUND_VAR SYCLTOOLKIT_FOUND -# REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_LIBRARY SYCL_FLAGS -# VERSION_VAR SYCL_LANGUAGE_VERSION -# REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") -set(SYCLTOOLKIT_FOUND True) - diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index b2bbadaa4..f0724ef55 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -25,24 +25,6 @@ if(NOT SYCL_VERSION) return() endif() -find_library(SYCL_LIBRARIES sycl HINTS ${SYCL_LIBRARY_DIR}) -# On Windows, currently there's no sycl.lib. Only sycl7.lib with version suffix, -# where the current version of the SYCL runtime is 7. -# Until oneAPI adds support to sycl.lib without the version suffix, -# sycl_runtime_version needs to be hardcoded and uplifted when SYCL runtime version uplifts. -# TODO: remove this when sycl.lib is supported on Windows -if(WIN32) - set(sycl_runtime_version 7) - find_library( - SYCL_LIBRARIES - NAMES "sycl${sycl_runtime_version}" - HINTS ${SYCL_LIBRARY_DIR} - ) - if(SYCL_LIBRARIES STREQUAL "SYCL_LIBRARIES-NOTFOUND") - message(FATAL_ERROR "Cannot find a SYCL library on Windows") - endif() -endif() - set(SYCL_COMPILER_VERSION) file(READ ${SYCL_VERSION} version_contents) string(REGEX MATCHALL "__SYCL_COMPILER_VERSION +[0-9]+" VERSION_LINE "${version_contents}") diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index c95e860fa..815ad018f 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -1,6 +1,6 @@ # ATen XPU sources -file(GLOB xpu_cpp "xpu/*.cpp", "native/xpu/*.cpp", "native/sparse/*.cpp") +file(GLOB xpu_cpp "xpu/*.cpp" "native/xpu/*.cpp" "native/sparse/*.cpp") file(GLOB xpu_sycl "native/xpu/sycl/*.cpp") list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp}) diff --git a/src/ATen/native/xpu/sycl/AmpKernels.cpp b/src/ATen/native/xpu/sycl/AmpKernels.cpp index 5e8d258a6..87354eea9 100644 --- a/src/ATen/native/xpu/sycl/AmpKernels.cpp +++ b/src/ATen/native/xpu/sycl/AmpKernels.cpp @@ -102,9 +102,9 @@ void amp_foreach_non_finite_check_and_unscale_kernel( } struct AmpUpdateScaleKernelFunctor { - void operator()(sycl::item<1> item) const { + void operator()(sycl::nd_item<1> item) const { // There is only single item/task scheduled. - if (item.get_linear_id() != 0) + if (item.get_global_linear_id() != 0) return; if (*found_inf_) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5e0a8549..fa17491a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,7 +82,7 @@ foreach(lib ${TORCH_XPU_OPS_LIBRARIES}) target_include_directories(${lib} PUBLIC ${ATen_XPU_INCLUDE_DIRS}) target_include_directories(${lib} PUBLIC ${SYCL_INCLUDE_DIR}) - target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARIES}) + target_link_libraries(${lib} PUBLIC ${SYCL_LIBRARY}) endforeach() include(${TORCH_XPU_OPS_ROOT}/cmake/ClangFormat.cmake) diff --git a/test/sycl/CMakeLists.txt b/test/sycl/CMakeLists.txt index 0671fbc94..5bbee8643 100644 --- a/test/sycl/CMakeLists.txt +++ b/test/sycl/CMakeLists.txt @@ -42,7 +42,7 @@ add_dependencies(test_sycl_build_archive sycl_simple_kernel_test) # Instead, we use explicit linkage option '--whole-archive', which is required # by linkage of device object modules archived in the static library. Then # explicit linkage configuration of SYCL runtime library is required. 
-target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARIES}) +target_link_libraries(test_sycl_build_archive ${SYCL_LIBRARY}) if(INSTALL_TEST) install(TARGETS test_sycl_build_archive DESTINATION bin) From fcaa34cb5c740c81fab2126b0a68e5f4c724324a Mon Sep 17 00:00:00 2001 From: majing Date: Thu, 25 Jul 2024 08:45:49 +0800 Subject: [PATCH 7/8] Add aten::histogram and variant (#569) CPU/MPS only ops --------- Signed-off-by: majing Signed-off-by: Ma, Jing1 Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Histogram.cpp | 195 +++++++++++++++ src/ATen/native/xpu/sycl/HistogramKernels.h | 23 ++ .../native/xpu/sycl/HistogramddKernels.cpp | 227 ++++++++++++++++++ test/xpu/extended/run_test_with_skip.py | 5 + test/xpu/extended/test_ops_xpu.py | 6 + test/xpu/run_test_with_skip.py | 4 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 4 + 8 files changed, 465 insertions(+) create mode 100644 src/ATen/native/xpu/Histogram.cpp create mode 100644 src/ATen/native/xpu/sycl/HistogramKernels.h create mode 100644 src/ATen/native/xpu/sycl/HistogramddKernels.cpp diff --git a/src/ATen/native/xpu/Histogram.cpp b/src/ATen/native/xpu/Histogram.cpp new file mode 100644 index 000000000..de49952b2 --- /dev/null +++ b/src/ATen/native/xpu/Histogram.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +namespace at { + +/* Checks properties of input tensors input, bins, and weight. + */ +void histogramdd_check_inputs( + const Tensor& input, + const Tensor& bins, + const std::optional& weight) { + if (weight.has_value()) { + TORCH_CHECK( + weight->device() == input.device(), + "weight and input need to be on the same device.") + } + auto input_dtype = input.dtype(); + auto bins_dtype = bins.dtype(); + TORCH_CHECK( + input_dtype == bins_dtype, + "torch.histogramdd: input tensor and bins tensors should", + " have the same dtype, but got input with dtype ", + input_dtype, + " and bins with dtype ", + bins_dtype); + + const int64_t bins_dim = bins.dim(); + TORCH_CHECK( + bins_dim == 1, + "torch.histogramdd: bins tensor should have one dimension,", + " but got ", + bins_dim, + " dimensions in the bin tensor"); + + const int64_t numel = bins.numel(); + TORCH_CHECK( + numel > 0, + "torch.histogramdd: bins tensor should have at least 1 element,", + " but got ", + numel, + " elements in the bin tensor"); + + if (weight.has_value()) { + TORCH_CHECK( + input.dtype() == weight.value().dtype(), + "torch.histogramdd: if weight tensor is provided, ", + "input tensor and weight tensor should have the same dtype, ", + "but got input(", + input.dtype(), + ")", + ", and weight(", + weight.value().dtype(), + ")"); + + auto input_sizes = input.sizes().vec(); + + auto weight_sizes = weight.value().sizes().vec(); + if (weight_sizes.empty()) { + // correctly handle scalars + weight_sizes = {1}; + } + + TORCH_CHECK( + input_sizes == weight_sizes, + "torch.histogramdd: if weight tensor is provided it should have", + " the same shape as the input tensor excluding its innermost ", + "dimension, but got input with shape ", + input.sizes(), + " and weight ", + "with shape ", + weight.value().sizes()); + } +} + +/* Checks properties of output tensors hist and bin_edges, then resizes them. 
+ */ +void histogramdd_prepare_out( + const Tensor& input, + int64_t bin_ct, + const Tensor& hist, + const Tensor& bin_edges) { + TORCH_CHECK( + input.dtype() == hist.dtype(), + "torch.histogram: input tensor and hist tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and hist ", + hist.dtype()); + + TORCH_CHECK( + input.dtype() == bin_edges.dtype(), + "torch.histogram: input tensor and bin_edges tensor should", + " have the same dtype, but got input ", + input.dtype(), + " and bin_edges ", + bin_edges.dtype()); + + TORCH_CHECK( + bin_ct > 0, "torch.histogram(): bins must be > 0, but got ", bin_ct); + + at::native::resize_output(bin_edges, {bin_ct + 1}); + + at::native::resize_output(hist, {bin_ct}); +} + +void histogramdd_prepare_out( + const Tensor& input, + const Tensor& bins, + const Tensor& hist, + const Tensor& bin_edges) { + int64_t bin_ct = bins.numel() - 1; + histogramdd_prepare_out(input, bin_ct, hist, bin_edges); +} + +static Tensor& histogramdd_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + histogramdd_check_inputs(self, bins, weight); + histogramdd_prepare_out(self, bins, hist, bin_edges); + + bin_edges.copy_(bins); + + at::native::xpu::histogramdd_kernel(self, weight, density, hist, bin_edges); + return hist; +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_out( + reshaped_self, bins, reshaped_weight, density, hist, bin_edges); + + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + const Tensor& bins, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges = at::empty({0}, bins.options(), MemoryFormat::Contiguous); + return histogram_out(self, bins, weight, density, hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram_out( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& bin_edges) { + Tensor reshaped_self = self.reshape({self.numel()}); + std::optional reshaped_weight = weight.has_value() + ? 
weight.value().reshape({weight.value().numel()}) + : weight; + + histogramdd_prepare_out(reshaped_self, bin_ct, hist, bin_edges); + histogramdd_check_inputs(reshaped_self, bin_edges, reshaped_weight); + + at::native::xpu::histogramdd_linear_kernel( + reshaped_self, bin_ct, range, reshaped_weight, density, hist, bin_edges); + return std::forward_as_tuple(hist, bin_edges); +} + +std::tuple XPUNativeFunctions::histogram( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density) { + Tensor hist = at::empty({0}, self.options(), MemoryFormat::Contiguous); + Tensor bin_edges_out = at::empty({0}, self.options()); + return histogram_out( + self, bin_ct, range, weight, density, hist, bin_edges_out); +} + +} // namespace at \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramKernels.h b/src/ATen/native/xpu/sycl/HistogramKernels.h new file mode 100644 index 000000000..da153186a --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramKernels.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges); + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/HistogramddKernels.cpp b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp new file mode 100644 index 000000000..be888e4b4 --- /dev/null +++ b/src/ATen/native/xpu/sycl/HistogramddKernels.cpp @@ -0,0 +1,227 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::native::xpu { + +template +struct HistogramddKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_ * bin_size_) { + int64_t ele_idx = wi_id / bin_size_; + int64_t bin_idx = wi_id % bin_size_; + + // [left, right) + if (input_[ele_idx] >= bin_edges_[bin_idx] && + input_[ele_idx] < bin_edges_[bin_idx + 1]) { + scalar_t value = weight_ ? weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_idx), value); + return; + } + + // For last bin, [left, right] + if (bin_idx == 0 && input_[ele_idx] == bin_edges_[bin_size_]) { + scalar_t value = weight_ ? 
weight_[ele_idx] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin_size_ - 1), value); + } + } + } + + HistogramddKernelFunctor( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) + : input_(input), + bin_edges_(bin_edges), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size) {} + + private: + const scalar_t* input_; + const scalar_t* bin_edges_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; +}; + +// For one dimension case +template +void histogramdd_template( + const scalar_t* input, + const scalar_t* bin_edges, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size) { + HistogramddKernelFunctor kfn( + input, bin_edges, hist, weight, input_size, bin_size); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = + (input_size * bin_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +template +struct HistogramddLinearKernelFunctor { + void operator()(sycl::nd_item<1> item_id) const { + int64_t wi_id = item_id.get_global_id(); + if (wi_id < input_size_) { + scalar_t i_value = input_[wi_id]; + if (i_value >= leftmost_edge_ && i_value <= rightmost_edge_) { + int64_t bin = + (int64_t)(((i_value - leftmost_edge_)) * bin_size_ / (rightmost_edge_ - leftmost_edge_)); + if (bin == bin_size_) + bin -= 1; + scalar_t value = weight_ ? weight_[wi_id] : (scalar_t)1; + atomicAdd((sycl_global_ptr)(hist_ + bin), value); + } + } + } + + HistogramddLinearKernelFunctor( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) + : input_(input), + hist_(hist), + weight_(weight), + input_size_(input_size), + bin_size_(bin_size), + leftmost_edge_(leftmost_edge), + rightmost_edge_(rightmost_edge) {} + + private: + const scalar_t* input_; + scalar_t* hist_; + const scalar_t* weight_; + int64_t input_size_; + int64_t bin_size_; + double leftmost_edge_; + double rightmost_edge_; +}; + +// For one dimension case +template +void histogramdd_linear_template( + const scalar_t* input, + scalar_t* hist, + const scalar_t* weight, + int64_t input_size, + int64_t bin_size, + double leftmost_edge, + double rightmost_edge) { + HistogramddLinearKernelFunctor kfn( + input, hist, weight, input_size, bin_size, leftmost_edge, rightmost_edge); + const int64_t work_group_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (input_size + work_group_size - 1) / work_group_size; + sycl_kernel_submit( + num_wg * work_group_size, work_group_size, getCurrentSYCLQueue(), kfn); +} + +void histogramdd_kernel( + const Tensor& self, + const std::optional& weight, + bool density, + Tensor& hist, + const Tensor& bin_edges_) { + globalContext().alertNotDeterministic("histogramdd_kernel_xpu"); + hist.fill_(0); + Tensor bin_edges = bin_edges_.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_xpu", [&]() { + histogramdd_template( + self.data_ptr(), + bin_edges.data_ptr(), + hist.data_ptr(), + weight.has_value() ? 
weight->data_ptr() : nullptr, + self.numel(), + bin_edges.numel() - 1); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +void histogramdd_linear_kernel( + const Tensor& self, + int64_t bin_ct, + std::optional> range, + const std::optional& weight, + bool density, + Tensor& hist, + Tensor& out_bin_edges) { + globalContext().alertNotDeterministic("histogramdd_linear_kernel_xpu"); + hist.fill_(0); + + // default range for empty input + double leftmost_edge = 0., rightmost_edge = 1.; + if (range.has_value()) { + leftmost_edge = range.value()[0]; + rightmost_edge = range.value()[1]; + } else if (self.numel() > 0) { + auto extrema = at::aminmax(self); + leftmost_edge = std::get<0>(extrema).item(); + rightmost_edge = std::get<1>(extrema).item(); + } + + if (leftmost_edge == rightmost_edge) { + leftmost_edge -= 0.5; + rightmost_edge += 0.5; + } + + at::linspace_out(out_bin_edges, leftmost_edge, rightmost_edge, bin_ct + 1); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, self.scalar_type(), "histogram_linear_xpu", [&]() { + histogramdd_linear_template( + self.data_ptr(), + hist.data_ptr(), + weight.has_value() ? weight->data_ptr() : nullptr, + self.numel(), + bin_ct, + leftmost_edge, + rightmost_edge); + }); + + if (density) { + const auto hist_sum = hist.sum(); + hist.div_(hist_sum); + Tensor bin_lengths = out_bin_edges.diff(); + hist.div_(bin_lengths); + } +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop \ No newline at end of file diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 76790d817..a05d47e05 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -163,6 +163,11 @@ "test_compare_cpu_std_mean_xpu_bfloat16", "test_compare_cpu_sub_xpu_float16", "test_compare_cpu_var_mean_xpu_bfloat16", + + # test case doesn't make sense, will file an issue to track it. 
+    # https://github.com/pytorch/pytorch/issues/130916
+    "test_compare_cpu_histogram_xpu_float32",
+    "test_compare_cpu_histogram_xpu_float64",
 )

diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py
index 792c7c569..1d9d3df6d 100644
--- a/test/xpu/extended/test_ops_xpu.py
+++ b/test/xpu/extended/test_ops_xpu.py
@@ -81,6 +81,12 @@ def test_compare_cpu(self, device, dtype, op):
             self.proxy = Namespace.TestCommonProxy()
             test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
             test_common_test_fn(self.proxy, device, dtype, op)
+        # for operators CUDA doesn't support
+        elif (op.name in ["histogram",]):
+            if dtype in op.dtypes:
+                self.proxy = Namespace.TestCommonProxy()
+                test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
+                test_common_test_fn(self.proxy, device, dtype, op)
         else:
             pytest.skip(f"{op.name} has not supported {dtype} yet both for cpu and xpu")

diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py
index 3ffc3feb6..313a2e8b3 100644
--- a/test/xpu/run_test_with_skip.py
+++ b/test/xpu/run_test_with_skip.py
@@ -783,6 +783,10 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     # torch.complex32 - "sinh_cpu" not implemented for 'ComplexHalf'
     "test_dtypes_cosh_xpu",
 
+    # aten::histogram is implemented to align with MPS operator coverage; CUDA doesn't
+    # support it, but the test_dtypes infrastructure leverages CUDA-supported datatypes
+    "test_dtypes_histogram_xpu",
+
     # The following dtypes worked in forward but are not listed by the OpInfo: {torch.float16}.
     # Align with CPU implementation since,
     # 1. most cases of nextafter require Half dtype.

diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
index 96e43058a..6d989fa14 100644
--- a/test/xpu/xpu_test_utils.py
+++ b/test/xpu/xpu_test_utils.py
@@ -209,6 +209,7 @@
     "aminmax",
     "argmin",
     "conj_physical",
+    "histogram",
     "repeat_interleave",
     "fmax",
     "fmin",

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 48ddc4c3f..224c0e7d8 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -660,6 +660,10 @@ supported:
   - ceil
   - ceil_
   - ceil.out
+  - histogram.bins_tensor
+  - histogram.bins_tensor_out
+  - histogram.bin_ct
+  - histogram.bin_ct_out
   - repeat_interleave.Tensor
   - norm.ScalarOpt_dim_dtype
   - norm.dtype_out
   - norm.ScalarOpt_dim

From bbd200abe2d3a3e290c45e04127723f65293ed7e Mon Sep 17 00:00:00 2001
From: mengfei25
Date: Thu, 25 Jul 2024 18:27:53 +0800
Subject: [PATCH 8/8] Timm_regnet regression tracked in issue 603 (#650)

Skip the accuracy check; tracked in
https://github.com/intel/torch-xpu-ops/issues/603
---
 .github/ci_expected_accuracy/inductor_torchbench_training.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_expected_accuracy/inductor_torchbench_training.csv b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
index 2984ecbc9..dc766eac0 100644
--- a/.github/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/.github/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -93,7 +93,7 @@ tacotron2,fail_to_run,fail_to_run,fail_to_run,fail_to_run,fail_to_run
 timm_efficientdet,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load,model_fail_to_load
 timm_efficientnet,pass,pass,pass,pass,pass
 timm_nfnet,pass,pass,pass,pass,pass
-timm_regnet,pass,pass,pass,pass,pass
+timm_regnet,pass,fail_accuracy,pass,pass,pass
 timm_resnest,pass,pass,pass,pass,pass
 timm_vision_transformer,pass,pass,pass,pass,pass
timm_vision_transformer_large,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip,pass_due_to_skip
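To close, a usage-level illustration of the histogram support added in patch 7,
covering both overloads registered in yaml/xpu_functions.yaml
(histogram.bin_ct and histogram.bins_tensor). This is a sketch, assuming an
XPU-enabled PyTorch build; the input values are arbitrary:

```python
import torch

x = torch.randn(1000, device="xpu")

# histogram.bin_ct: a fixed number of uniform bins over an explicit range
hist, edges = torch.histogram(x, bins=10, range=(-3.0, 3.0))

# histogram.bins_tensor: caller-supplied (possibly non-uniform) bin edges
bin_edges = torch.tensor([-3.0, -1.0, 0.0, 0.5, 3.0], device="xpu")
hist2, edges2 = torch.histogram(x, bins=bin_edges, density=True)

# With density=True the kernel divides by the total count and then by the bin
# widths, so the histogram integrates to 1 over the covered range.
assert torch.allclose((hist2 * edges2.diff()).sum(), torch.tensor(1.0, device="xpu"))
```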