From 519cefe84ca9ae936138005eab664f6e98e4acdf Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:58:01 +0800 Subject: [PATCH 01/20] Add aten::lerp_Tensor/lerp_Scalar and their variants (#449) Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Lerp.cpp | 110 +++++++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 2 - src/ATen/native/xpu/sycl/LerpKernels.cpp | 92 +++++++++++++++++++ src/ATen/native/xpu/sycl/LerpKernels.h | 11 +++ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 6 ++ 6 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/Lerp.cpp create mode 100644 src/ATen/native/xpu/sycl/LerpKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/LerpKernels.h diff --git a/src/ATen/native/xpu/Lerp.cpp b/src/ATen/native/xpu/Lerp.cpp new file mode 100644 index 000000000..272417b39 --- /dev/null +++ b/src/ATen/native/xpu/Lerp.cpp @@ -0,0 +1,110 @@ +#include +#include +#include +#include + +#include + +namespace at { + +TensorIterator lerp_tensor_meta( + const Tensor& self, + const Tensor& end, + const Tensor& weight, + Tensor& out) { + TORCH_CHECK( + self.dtype() == end.dtype(), + "expected dtype ", + self.dtype(), + " for `end` but got dtype ", + end.dtype()); + TORCH_CHECK( + self.dtype() == weight.dtype(), + "expected dtype ", + self.dtype(), + " for `weight` but got dtype ", + weight.dtype()); + TensorIterator iter; + iter.build(TensorIteratorConfig() + .add_output(out) + .add_const_input(self) + .add_const_input(end) + .add_const_input(weight)); + return iter; +} + +Tensor XPUNativeFunctions::lerp( + const Tensor& self, + const Tensor& end, + const Tensor& weight) { + Tensor out; + auto iter = lerp_tensor_meta(self, end, weight, out); + native::xpu::lerp_tensor_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::lerp_( + Tensor& self, + const Tensor& end, + const Tensor& weight) { + auto iter = lerp_tensor_meta(self, end, weight, self); + native::xpu::lerp_tensor_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::lerp_out( + const Tensor& self, + const Tensor& end, + const Tensor& weight, + Tensor& out) { + auto iter = lerp_tensor_meta(self, end, weight, out); + native::xpu::lerp_tensor_kernel(iter); + return out; +} + +TensorIterator lerp_scalar_meta( + const Tensor& self, + const Tensor& end, + const Scalar& /*weight*/, + Tensor& out) { + TORCH_CHECK( + self.dtype() == end.dtype(), + "expected dtype ", + self.dtype(), + " for `end` but got dtype ", + end.dtype()); + TensorIterator iter; + iter.build_binary_op(out, self, end); + return iter; +} + +Tensor XPUNativeFunctions::lerp( + const Tensor& self, + const Tensor& end, + const Scalar& weight) { + Tensor out; + auto iter = lerp_scalar_meta(self, end, weight, out); + native::xpu::lerp_scalar_kernel(iter, weight); + return iter.output(); +} + +Tensor& XPUNativeFunctions::lerp_( + Tensor& self, + const Tensor& end, + const Scalar& weight) { + auto iter = lerp_scalar_meta(self, end, weight, self); + native::xpu::lerp_scalar_kernel(iter, weight); + return self; +} + +Tensor& XPUNativeFunctions::lerp_out( + const Tensor& self, + const Tensor& end, + const Scalar& weight, + Tensor& out) { + auto iter = lerp_scalar_meta(self, end, weight, out); + native::xpu::lerp_scalar_kernel(iter, weight); + return out; +} + +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 2076c2f0e..bc26c2507 100644 --- 
a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -244,8 +244,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "isposinf.out", "kthvalue.values", "lcm.out", - "lerp.Scalar_out", - "lerp.Tensor_out", "lgamma.out", "linalg_cholesky_ex.L", "linalg_cross.out", diff --git a/src/ATen/native/xpu/sycl/LerpKernels.cpp b/src/ATen/native/xpu/sycl/LerpKernels.cpp new file mode 100644 index 000000000..b0f480ac3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/LerpKernels.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include + +#include + +namespace at::native::xpu { + +template +struct LerpTensorComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(scalar_t self_val, scalar_t end_val, scalar_t weight_val) + const { + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + opmath_t weight_val_f = weight_val; + return lerp(self_val, end_val, weight_val); + } +}; + +template +struct LerpTensorFunctor { + scalar_t operator()(scalar_t self_val, scalar_t end_val, scalar_t weight_val) + const { + return lerp(self_val, end_val, weight_val); + } +}; + +template +struct LerpScalarComplexFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(scalar_t self_val, scalar_t end_val) const { + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + return lerp(self_val, end_val, weight_val_); + } + + LerpScalarComplexFunctor(opmath_t weight_val) : weight_val_(weight_val) {} + + private: + opmath_t weight_val_; +}; + +template +struct LerpScalarFunctor { + using opmath_t = at::opmath_type; + scalar_t operator()(scalar_t self_val, scalar_t end_val) const { + return lerp(self_val, end_val, weight_val_); + } + + LerpScalarFunctor(opmath_t weight_val) : weight_val_(weight_val) {} + + private: + opmath_t weight_val_; +}; + +void lerp_tensor_kernel(at::TensorIteratorBase& iter) { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] { + gpu_kernel(iter, LerpTensorComplexFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] { + gpu_kernel(iter, LerpTensorFunctor()); + }); + } +} + +void lerp_scalar_kernel( + at::TensorIteratorBase& iter, + const c10::Scalar& weight) { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_xpu", [&] { + using opmath_t = at::opmath_type; + auto weight_val = weight.to(); + gpu_kernel(iter, LerpScalarComplexFunctor(weight_val)); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "lerp_xpu", [&] { + using opmath_t = at::opmath_type; + auto weight_val = weight.to(); + gpu_kernel(iter, LerpScalarFunctor(weight_val)); + }); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LerpKernels.h b/src/ATen/native/xpu/sycl/LerpKernels.h new file mode 100644 index 000000000..c455adee8 --- /dev/null +++ b/src/ATen/native/xpu/sycl/LerpKernels.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void lerp_tensor_kernel(TensorIteratorBase& iter); + +void lerp_scalar_kernel(TensorIteratorBase& iter, const c10::Scalar& weight); + +} // namespace at::native::xpu diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 585a7c96f..ee07c5fbc 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -149,6 +149,7 @@ "_batch_norm_with_update", 
"bincount", "renorm", + "lerp", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 0b30a961a..9f10825d2 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -65,6 +65,12 @@ supported: - le.Tensor - le.Tensor_out - le_.Tensor + - lerp.Tensor + - lerp.Tensor_out + - lerp_.Tensor + - lerp.Scalar + - lerp.Scalar_out + - lerp_.Scalar - gt.Scalar - gt.Scalar_out - gt_.Scalar From 93f1b23fe078088c71c083a5a1bc782d229c88d0 Mon Sep 17 00:00:00 2001 From: majing Date: Wed, 3 Jul 2024 15:01:03 +0800 Subject: [PATCH 02/20] Add aten::upsample_bilinear2d/upsample_bilinear2d_backward (#422) Signed-off-by: majing Co-authored-by: Feng Yuan --- src/ATen/native/xpu/{sycl => }/UpSample.h | 9 +- src/ATen/native/xpu/UpSampleBicubic2d.cpp | 4 +- src/ATen/native/xpu/UpSampleBilinear2d.cpp | 160 +++++++ src/ATen/native/xpu/XPUFallback.template | 2 - src/ATen/native/xpu/sycl/GridSampler.cpp | 4 +- .../xpu/sycl/UpSampleBicubic2dKernels.cpp | 2 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 414 ++++++++++++++++++ .../xpu/sycl/UpSampleBilinear2dKernels.h | 24 + .../xpu/sycl/UpSampleNearest1dKernels.h | 12 +- .../xpu/sycl/UpSampleNearest2dKernels.h | 12 +- test/xpu/extended/run_test_with_skip.py | 5 + test/xpu/run_test_with_skip.py | 5 + test/xpu/test_nn_xpu.py | 153 +++---- test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 4 + 15 files changed, 702 insertions(+), 109 deletions(-) rename src/ATen/native/xpu/{sycl => }/UpSample.h (97%) create mode 100644 src/ATen/native/xpu/UpSampleBilinear2d.cpp create mode 100644 src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h diff --git a/src/ATen/native/xpu/sycl/UpSample.h b/src/ATen/native/xpu/UpSample.h similarity index 97% rename from src/ATen/native/xpu/sycl/UpSample.h rename to src/ATen/native/xpu/UpSample.h index ffa8e9c56..5ca47c4d4 100644 --- a/src/ATen/native/xpu/sycl/UpSample.h +++ b/src/ATen/native/xpu/UpSample.h @@ -10,9 +10,9 @@ #include #include -namespace at::native { +namespace at::native::xpu { -inline std::array upsample_2d_common_check( +inline C10_UNUSED std::array upsample_2d_common_check( IntArrayRef input_size, IntArrayRef output_size) { TORCH_CHECK( @@ -49,7 +49,6 @@ inline std::array upsample_2d_common_check( return {nbatch, channels, output_height, output_width}; } -namespace xpu { inline size_t idx_cl( const size_t n, @@ -229,6 +228,4 @@ static scalar_t upsample_get_value_bounded( return data[batch][channel][access_y][access_x]; } -} // namespace xpu - -} // namespace at::native +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/UpSampleBicubic2d.cpp b/src/ATen/native/xpu/UpSampleBicubic2d.cpp index d59945135..509d6e449 100644 --- a/src/ATen/native/xpu/UpSampleBicubic2d.cpp +++ b/src/ATen/native/xpu/UpSampleBicubic2d.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -15,7 +15,7 @@ void upsample_bicubic2d_meta( std::optional scales_h, std::optional scales_w) { auto full_output_size = - native::upsample_2d_common_check(input.sizes(), output_size); + native::xpu::upsample_2d_common_check(input.sizes(), output_size); // Allow for empty batch size but not other dimensions TORCH_CHECK( diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp new file mode 100644 index 000000000..f0ace4344 --- /dev/null +++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp @@ -0,0 +1,160 @@ +#include +#include +#include + +#include +#include +#include + +namespace at 
{ + +void upsample_bilinear2d_meta( + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + Tensor& output) { + auto full_output_size = + native::xpu::upsample_2d_common_check(input.sizes(), output_size); + + // Allow for empty batch size but not other dimensions + TORCH_CHECK( + input.numel() != 0 || + c10::multiply_integers( + input.sizes().begin() + 1, input.sizes().end()), + "Non-empty 4D data tensor expected but got a tensor with sizes ", + input.sizes()); + + auto memory_format = input.suggest_memory_format(); + if (output.defined()) { + xpu::resize_out( + output, + full_output_size, + {}, + input.options().memory_format(memory_format)); + } else { + output = at::xpu::create_out( + full_output_size, {}, input.options().memory_format(memory_format)); + } +} + +void upsample_bilinear2d_backward_meta( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + Tensor& grad_input) { + auto full_output_size = + native::xpu::upsample_2d_common_check(input_size, output_size); + + TORCH_CHECK( + grad_output.dim() == 4, + "Expected grad_output to be a tensor of dimension 4 but got: dimension ", + grad_output.dim()); + + for (const auto i : c10::irange(4)) { + TORCH_CHECK( + grad_output.size(i) == full_output_size[i], + "Expected grad_output to have the same shape as output;", + " output.size(", + i, + ") = ", + full_output_size[i], + " but got grad_output.size(", + i, + ") = ", + grad_output.size(i)); + } + + auto memory_format = grad_output.suggest_memory_format(); + if (grad_input.defined()) { + xpu::resize_out( + grad_input, + input_size, + {}, + grad_output.options().memory_format(memory_format)); + } else { + grad_input = at::xpu::create_out( + input_size, {}, grad_output.options().memory_format(memory_format)); + } +} + +Tensor& XPUNativeFunctions::upsample_bilinear2d_out( + const Tensor& self, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w, + Tensor& output) { + upsample_bilinear2d_meta( + self, output_size, align_corners, scales_h, scales_w, output); + native::xpu::upsample_bilinear2d_out_kernel( + output, self, output_size, align_corners, scales_h, scales_w); + return output; +} + +Tensor XPUNativeFunctions::upsample_bilinear2d( + const Tensor& self, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w) { + Tensor output; + upsample_bilinear2d_out( + self, output_size, align_corners, scales_h, scales_w, output); + return output; +} + +Tensor& XPUNativeFunctions::upsample_bilinear2d_backward_out( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w, + Tensor& grad_input) { + globalContext().alertNotDeterministic("upsample_bilinear2d_backward_xpu"); + + upsample_bilinear2d_backward_meta( + grad_output, + output_size, + input_size, + align_corners, + scales_h, + scales_w, + grad_input); + + native::xpu::upsample_bilinear2d_backward_out_kernel( + grad_input, + grad_output, + output_size, + input_size, + align_corners, + scales_h, + scales_w); + return grad_input; +} + +Tensor XPUNativeFunctions::upsample_bilinear2d_backward( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w) { + Tensor grad_input; + 
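  // Note: grad_input is deliberately left undefined here; the meta function
  // reached through the _out overload below allocates it with grad_output's
  // options and suggested memory format.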
upsample_bilinear2d_backward_out( + grad_output, + output_size, + input_size, + align_corners, + scales_h, + scales_w, + grad_input); + return grad_input; +} + +} // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index bc26c2507..40f32b5a5 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -382,8 +382,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "unique_consecutive", "upsample_bicubic2d_backward.grad_input", "_upsample_bilinear2d_aa.out", - "upsample_bilinear2d_backward.grad_input", - "upsample_bilinear2d.out", "upsample_linear1d_backward.grad_input", "upsample_linear1d.out", "upsample_nearest3d.out", diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index bd8e43056..746e2b035 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -11,8 +11,8 @@ #include #include -#include "GridSampler.h" -#include "UpSample.h" +#include +#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index ef17e8eea..504f28d7b 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -3,8 +3,8 @@ #include #include #include +#include #include -#include "UpSample.h" namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp new file mode 100644 index 000000000..1ab02435a --- /dev/null +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -0,0 +1,414 @@ +#pragma clang diagnostic push +#pragma GCC diagnostic push +// Avoid SYCL compiler return-type error +#pragma clang diagnostic ignored "-Wreturn-type" +#pragma GCC diagnostic ignored "-Wreturn-type" + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native::xpu { + +template +struct UpsampleBilinear2dKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + int index = item.get_global_linear_id(); + + if (index < n_) { + const int output_x = index % output_width_; + const int output_y = index / output_width_; + + const accscalar_t h1r = area_pixel_compute_source_index( + rheight_, output_y, align_corners_, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < input_height_ - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth_, output_x, align_corners_, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < input_width_ - 1) ? 
1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + auto odata = out_data_acc_; + for (int n = 0; n < nbatch_; n++) { + for (int c = 0; c < channels_; ++c) { + const accscalar_t val = h0lambda * + (w0lambda * in_data_acc_[n][c][h1][w1] + + w1lambda * in_data_acc_[n][c][h1][w1 + w1p]) + + h1lambda * + (w0lambda * in_data_acc_[n][c][h1 + h1p][w1] + + w1lambda * in_data_acc_[n][c][h1 + h1p][w1 + w1p]); + odata[n][c][output_y][output_x] = static_cast(val); + } + } + } + } + UpsampleBilinear2dKernelFunctor( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata_acc, + PackedTensorAccessor odata_acc, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels) + : n_(n), + rheight_(rheight), + rwidth_(rwidth), + align_corners_(align_corners), + in_data_acc_(idata_acc), + out_data_acc_(odata_acc), + input_height_(input_height), + input_width_(input_width), + output_height_(output_height), + output_width_(output_width), + nbatch_(nbatch), + channels_(channels) {} + + private: + const int n_; + const accscalar_t rheight_; + const accscalar_t rwidth_; + const bool align_corners_; + const PackedTensorAccessor in_data_acc_; + PackedTensorAccessor out_data_acc_; + int64_t input_height_; + int64_t input_width_; + int64_t output_height_; + int64_t output_width_; + int64_t nbatch_; + int64_t channels_; +}; + +template +void launch_upsample_bilinear2d_kernel( + const int n, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + const PackedTensorAccessor idata_acc, + PackedTensorAccessor odata_acc, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels) { + auto queue = getCurrentSYCLQueue(); + int64_t wg_size = syclMaxWorkGroupSize(); + int num_group = at::ceil_div(n, (int)wg_size); + + UpsampleBilinear2dKernelFunctor kfn( + n, + rheight, + rwidth, + align_corners, + idata_acc, + odata_acc, + input_height, + input_width, + output_height, + output_width, + nbatch, + channels); + + sycl_kernel_submit( + sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); +} + +size_t idx( + const size_t nc, + const size_t height, + const size_t width, + const size_t y, + const size_t x) { + return (nc * height + y) * width + x; +} + +template +struct UpsampleBilinear2dBackwardKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + for (size_t index = + item.get_local_id(0) + item.get_group(0) * item.get_local_range(0); + index < o_numel_; + index += item.get_local_range(0) * item.get_group_range(0)) { + size_t index_temp = index; + const int w2 = index_temp % output_width_; + index_temp /= output_width_; + const int h2 = index_temp % output_height_; + const size_t nc = index_temp / output_height_; + + const accscalar_t h1r = area_pixel_compute_source_index( + rheight_, h2, align_corners_, /*cubic=*/false); + const int h1 = h1r; + const int h1p = (h1 < input_height_ - 1) ? 1 : 0; + const accscalar_t h1lambda = h1r - h1; + const accscalar_t h0lambda = static_cast(1) - h1lambda; + + const accscalar_t w1r = area_pixel_compute_source_index( + rwidth_, w2, align_corners_, /*cubic=*/false); + const int w1 = w1r; + const int w1p = (w1 < input_width_ - 1) ? 
1 : 0; + const accscalar_t w1lambda = w1r - w1; + const accscalar_t w0lambda = static_cast(1) - w1lambda; + + const scalar_t d2val = out_data_[index]; + + atomicAdd( + (sycl_global_ptr< + scalar_t>)(in_data_ + idx(nc, input_height_, input_width_, h1, w1)), + static_cast(h0lambda * w0lambda * d2val)); + + atomicAdd( + (sycl_global_ptr< + scalar_t>)(in_data_ + idx(nc, input_height_, input_width_, h1, w1 + w1p)), + static_cast(h0lambda * w1lambda * d2val)); + + atomicAdd( + (sycl_global_ptr< + scalar_t>)(in_data_ + idx(nc, input_height_, input_width_, h1 + h1p, w1)), + static_cast(h1lambda * w0lambda * d2val)); + + atomicAdd( + (sycl_global_ptr< + scalar_t>)(in_data_ + idx(nc, input_height_, input_width_, h1 + h1p, w1 + w1p)), + static_cast(h1lambda * w1lambda * d2val)); + } + } + UpsampleBilinear2dBackwardKernelFunctor( + const size_t nc, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + scalar_t* in_data, + const scalar_t* out_data, + const size_t o_numel, + const size_t i_numel) + : nc_(nc), + input_height_(input_height), + input_width_(input_width), + output_height_(output_height), + output_width_(output_width), + nbatch_(nbatch), + channels_(channels), + rheight_(rheight), + rwidth_(rwidth), + align_corners_(align_corners), + in_data_(in_data), + out_data_(out_data), + o_numel_(o_numel), + i_numel_(i_numel) {} + + private: + const size_t nc_; + int64_t input_height_; + int64_t input_width_; + int64_t output_height_; + int64_t output_width_; + int64_t nbatch_; + int64_t channels_; + const accscalar_t rheight_; + const accscalar_t rwidth_; + const bool align_corners_; + scalar_t* in_data_; + const scalar_t* out_data_; + const size_t o_numel_; + const size_t i_numel_; +}; + +template +void launch_upsample_bilinear2d_backward_kernel( + const size_t nc, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + const accscalar_t rheight, + const accscalar_t rwidth, + const bool align_corners, + scalar_t* idata, + const scalar_t* odata) { + auto queue = getCurrentSYCLQueue(); + int64_t wg_size = syclMaxWorkGroupSize(); + + const size_t o_numel = nc * output_width * output_height; + const size_t i_numel = nc * input_width * input_height; + + const size_t num_kernels = nc * output_width * output_height; + int num_group = at::ceil_div((int64_t)num_kernels, (int64_t)wg_size); + + UpsampleBilinear2dBackwardKernelFunctor kfn( + nc, + input_height, + input_width, + output_height, + output_width, + nbatch, + channels, + rheight, + rwidth, + align_corners, + idata, + odata, + o_numel, + i_numel); + sycl_kernel_submit( + sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); +} + +void upsample_bilinear2d_out_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU(__func__, {input_arg, output_arg}); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input.size(0); + int channels = input.size(1); + int input_height = input.size(2); + int input_width = input.size(3); + + if (input.sizes() == output.sizes()) { + output.copy_(input); + return; + } + + const int num_kernels = output_height * output_width; + + 
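  // Note: the dispatch below covers floating-point types plus Half/BFloat16,
  // computes the height/width source scales with area_pixel_compute_scale, and
  // launches one work-item per output (h, w) position; each work-item then
  // loops over all batches and channels.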
AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "upsample_bilinear2d_xpu", + [&] { + using accscalar_t = acc_type; + auto idata_acc = input.packed_accessor64(); + auto odata_acc = output.packed_accessor64(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + // TODO:a faster kernel for channel last + launch_upsample_bilinear2d_kernel( + num_kernels, + rheight, + rwidth, + align_corners, + idata_acc, + odata_acc, + input_height, + input_width, + output_height, + output_width, + nbatch, + channels); + }); +} + +void upsample_bilinear2d_backward_out_kernel( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w) { + TensorArg grad_input_arg{grad_input, "grad_input", 1}, + grad_output_arg{grad_output_, "grad_output_", 2}; + checkAllSameGPU(__func__, {grad_output_arg, grad_input_arg}); + + int output_height = output_size[0]; + int output_width = output_size[1]; + + int nbatch = input_size[0]; + int channels = input_size[1]; + int input_height = input_size[2]; + int input_width = input_size[3]; + + if (grad_input.numel() == 0) { + return; + } + + grad_input.zero_(); + + if (grad_output_.sizes() == grad_input.sizes()) { + grad_input.copy_(grad_output_); + return; + } + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + grad_output_.scalar_type(), + "upsample_bilinear2d_backward_xpu", + [&] { + using accscalar_t = acc_type; + + // TODO: using PackedTensorAccessor instead of copy + Tensor grad_input_c = grad_input.is_contiguous() + ? 
grad_input + : at::zeros(grad_input.sizes(), grad_input.options()); + Tensor grad_output = grad_output_.contiguous(); + + scalar_t* idata = grad_input_c.data_ptr(); + scalar_t* odata = grad_output.data_ptr(); + + const accscalar_t rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const accscalar_t rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + // TODO: a faster kernel for channel last + launch_upsample_bilinear2d_backward_kernel( + nbatch * channels, + input_height, + input_width, + output_height, + output_width, + nbatch, + channels, + rheight, + rwidth, + align_corners, + idata, + odata); + + if (!grad_input.is_contiguous()) { + grad_input.copy_(grad_input_c); + } + }); +} + +} // namespace at::native::xpu + +#pragma GCC diagnostic pop +#pragma clang diagnostic pop diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h new file mode 100644 index 000000000..3f75f79cf --- /dev/null +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void upsample_bilinear2d_out_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + std::optional scales_h, + std::optional scales_w); + +void upsample_bilinear2d_backward_out_kernel( + Tensor& grad_input, + const Tensor& grad_output_, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w); + +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h index 2b98215f3..bb6dd83ff 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h @@ -1,7 +1,9 @@ +#pragma once + #include -#include -namespace at::native { -namespace xpu { +#include + +namespace at::native::xpu { void upsample_nearest1d_kernel( Tensor& output, @@ -18,6 +20,4 @@ void upsample_nearest1d_backward_kernel( std::optional scales, bool is_exact); -} // namespace xpu - -} // namespace at::native +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h index 9adf2e73a..7d11e03af 100644 --- a/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h +++ b/src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h @@ -1,7 +1,9 @@ +#pragma once + #include -#include -namespace at::native { -namespace xpu { +#include + +namespace at::native::xpu { void upsample_nearest2d_kernel( Tensor& output, @@ -20,6 +22,4 @@ void upsample_nearest2d_backward_kernel( c10::optional scales_w, bool is_exact); -} // namespace xpu - -} // namespace at::native +} // namespace at::native::xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 06ebb4529..560e23830 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -91,6 +91,11 @@ # https://github.com/intel/torch-xpu-ops/issues/412 "test_compare_cpu_abs_xpu_bool", + # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision + # Not in CUDA test scope too + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16", + "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16", + # CPU result is not golden reference 
"test_compare_cpu_nn_functional_group_norm_xpu_bfloat16", "test_compare_cpu_nn_functional_group_norm_xpu_float16", diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index fb2f24de8..9287a85b6 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1365,6 +1365,9 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_sum_cuda_half", + # align CUDA to skip, XPU implementation is not yet supporting uint8 + "test_upsamplingBiMode2d_consistency", + "test_upsamplingBiLinear2d_consistency_interp_size_bug", ) res += launch_test("test_nn_xpu.py", skip_list) @@ -2898,6 +2901,8 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_cuda_vitals_gpu_only_xpu", # torch.utils.swap_tensors AssertionError: RuntimeError not raised "test_swap_basic", + # Needs pr to enable deterministic implementation for interpolate op + "test_deterministic_interpolate_bilinear_xpu", # Precision error # Fail in high probability in preci. diff --git a/test/xpu/test_nn_xpu.py b/test/xpu/test_nn_xpu.py index 897126128..b91800473 100644 --- a/test/xpu/test_nn_xpu.py +++ b/test/xpu/test_nn_xpu.py @@ -2131,90 +2131,6 @@ def issue_24823_2(): issue_24823_2() TestNNDeviceType.test_grid_sample_large=_test_grid_sample_large -@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) -@parametrize_test("mode", ["bilinear", "bicubic"]) -@parametrize_test("antialias", [True, False]) -@parametrize_test("align_corners", [True, False]) -@parametrize_test("num_channels", [3, 5]) -@parametrize_test("output_size", [32, 600]) -@parametrize_test("check_as_unsqueezed_3d_tensor", [True, False]) -@parametrize_test("non_contig", [False, "sliced", "restrided"]) -@parametrize_test("batch_size", [1, 5]) -def upsamplingBiMode2d_consistency( - self, - device, - memory_format, - mode, - antialias, - align_corners, - num_channels, - output_size, - check_as_unsqueezed_3d_tensor, - non_contig, - batch_size, -): - # Check output value consistency between resized_input_uint8 and resized input_float - if torch.device(device).type == "xpu": - raise SkipTest("XPU implementation is not yet supporting uint8") - - torch.manual_seed(0) - - # - input range is set to [30, 220] for bicubic mode, because the bicubic kernel may create - # [intermediate] values outside of the [0, 255] range, which need - # to be clipped in uint8 path, but not in float path. This isn't - # an issue with bilinear kernel. - input_range = (30, 220) if mode == "bicubic" else (0, 256) - input_ui8 = torch.randint(*input_range, size=(batch_size, num_channels, 400, 400), dtype=torch.uint8, device=device) - input_ui8 = input_ui8.contiguous(memory_format=memory_format) - - if non_contig == "sliced": - input_ui8 = input_ui8[:, :, 10:-10, 10:-10] - elif non_contig == "restrided": - input_ui8 = input_ui8[:, :, ::2, ::2] - - if batch_size == 1 and check_as_unsqueezed_3d_tensor: - input_ui8 = input_ui8[0, ...] - input_ui8 = input_ui8[None, ...] 
- - input_f32 = input_ui8.float() - - output_f32 = F.interpolate( - input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias - ).round().clip(0, 255) - output_ui8 = F.interpolate( - input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias - ) - - if non_contig is False: - self.assertTrue(input_ui8.is_contiguous(memory_format=memory_format)) - - # FIXME if-clause shows the current behaviour which is definitely unexpected. - # Ideally we want to fix it such that both the ui8 and f32 outputs are also channels_last - # See for more details: https://github.com/pytorch/pytorch/pull/100373 - if batch_size == 1 and check_as_unsqueezed_3d_tensor and memory_format == torch.channels_last: - self.assertTrue(output_ui8.is_contiguous()) - self.assertTrue(output_f32.is_contiguous()) - else: - self.assertTrue(output_ui8.is_contiguous(memory_format=memory_format)) - self.assertTrue(output_f32.is_contiguous(memory_format=memory_format)) - - if mode == "bilinear": - torch.testing.assert_close(output_f32, output_ui8.float(), rtol=0, atol=1) - else: - diff = (output_f32 - output_ui8.float()).abs() - self.assertLess(diff.max(), 15) - - threshold = 2 - percent = 3 - self.assertLess((diff > threshold).float().mean(), percent / 100) - - threshold = 5 - percent = 1 - self.assertLess((diff > threshold).float().mean(), percent / 100) - - self.assertLess(diff.mean(), 0.4) -TestNNDeviceType.test_upsamplingBiMode2d_consistency = upsamplingBiMode2d_consistency - def _test_grid_sample_half_precision(self): def helper(shape_in, shape_out, align_corners): for mode in ('bilinear', 'nearest', 'bicubic'): @@ -2255,6 +2171,75 @@ def helper(shape_in, shape_out, align_corners): # helper((32, 64, 16, 16, 16), (32, 8, 8, 8, 3), False) # grid_sampler_3d is not supported in xpu TestNNDeviceType.test_grid_sample_bfloat16_precision=_test_grid_sample_bfloat16_precision +@parametrize_test("antialias", [True, False]) +@parametrize_test("align_corners", [True, False]) +@parametrize_test("mode", ["bilinear", "bicubic"]) +@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) +def upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format): + # Forward AD does not support XLA because XLA tensors don't have storage + check_forward_ad = torch.device(device).type != 'xla' + + kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias) + # test float scale factor up & downsampling + for scale_factor in [0.5, 1.5, 2]: + in_t = torch.ones( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() + out_size = int(math.floor(in_t.shape[-1] * scale_factor)) + with warnings.catch_warnings(record=True) as w: + out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) + expected_out = torch.ones(2, 3, out_size, out_size, device=device, dtype=torch.double) + self.assertEqual(expected_out, out_t) + # Assert that memory format is carried through to the output + self.assertTrue(out_t.is_contiguous(memory_format=memory_format)) + out_t.backward(torch.randn_like(out_t)) + self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format)) + + if torch.device(device).type == 'xpu': + # Bilinear backward is nondeterministic because of atomicAdd usage + nondet_tol = 1e-5 + else: + nondet_tol = 0.0 + + input = torch.randn( + 2, 3, 8, 8, device=device, + dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_() + gradcheck( + lambda 
x: F.interpolate(x, out_size, **kwargs), + [input], + check_forward_ad=check_forward_ad, nondet_tol=nondet_tol + ) + gradgradcheck( + lambda x: F.interpolate(x, out_size, **kwargs), + [input], + check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol + ) + + # Assert that cpu and cuda give same results + if torch.device(device).type == 'xpu': + for shapes in [ + (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2) + ]: + a_xpu = torch.randn( + *shapes, device=device, dtype=torch.double + ).contiguous(memory_format=memory_format).requires_grad_() + a_cpu = a_xpu.detach().cpu().requires_grad_() + + with warnings.catch_warnings(record=True): + out_xpu = F.interpolate(a_xpu, scale_factor=scale_factor, **kwargs) + out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs) + + self.assertEqual(out_cpu, out_xpu.cpu()) + + g_cuda = torch.randn_like(out_xpu) + g_cpu = g_cuda.cpu() + + out_xpu.backward(g_cuda) + out_cpu.backward(g_cpu) + + self.assertEqual(a_xpu.grad, a_cpu.grad) +TestNNDeviceType.test_upsamplingBiMode2d = upsamplingBiMode2d + instantiate_device_type_tests( TestNNDeviceType, globals(), only_for="xpu", allow_xpu=True ) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index ee07c5fbc..61a2c6b0d 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -130,6 +130,7 @@ "nn.functional.unfold", "nn.functional.pad", "nn.functional.interpolate", + "nn.functional.upsample_bilinear", "nn.functional.upsample_nearest", # "nn.functional.nll_loss", # Lack of XPU implementation of aten::nll_loss2d_forward. Will retrieve the case, only if the op is implemented. "nn.functional.mse_loss", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 9f10825d2..751385d7a 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -418,6 +418,10 @@ supported: - _batch_norm_with_update - _batch_norm_with_update.out - batch_norm_backward + - upsample_bilinear2d + - upsample_bilinear2d.out + - upsample_bilinear2d_backward + - upsample_bilinear2d_backward.grad_input - _upsample_nearest_exact1d - _upsample_nearest_exact1d.out - upsample_nearest1d From 4b561238707723c4cba3cf02a25fd98c2725b761 Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Wed, 3 Jul 2024 18:36:53 +0800 Subject: [PATCH 03/20] Add aten::adaptive_avg_pool2d (#445) Supported Op List: - [x] adaptive_avg_pool2d.out - [x] _adaptive_avg_pool2d --------- Co-authored-by: Feng Yuan --- .../native/xpu/AdaptiveAveragePooling2d.cpp | 174 +++++- src/ATen/native/xpu/XPUFallback.template | 1 - .../sycl/AdaptiveAveragePooling2dKernel.cpp | 322 ----------- .../xpu/sycl/AdaptiveAveragePooling2dKernel.h | 12 - .../sycl/AdaptiveAveragePooling2dKernels.cpp | 513 ++++++++++++++++++ .../sycl/AdaptiveAveragePooling2dKernels.h | 17 + yaml/xpu_functions.yaml | 2 + 7 files changed, 689 insertions(+), 352 deletions(-) delete mode 100644 src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp delete mode 100644 src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.h create mode 100644 src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index 3054992e8..b09d1c8c0 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -1,32 +1,172 @@ #include +#include #include -#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include 
+#include +#else +#include +#endif + +#include + namespace at { +namespace { + +static c10::SymInt _safe_size(c10::SymIntArrayRef sizes, c10::IntArrayRef dim) { + c10::SymInt size = 1; + if (sizes.empty()) { + return 1; + } + for (auto d : dim) { + d = at::maybe_wrap_dim(d, static_cast(sizes.size())); + size *= sizes[d]; + } + return size; +} + +Tensor unsqueeze_multiple( + const Tensor& t, + OptionalIntArrayRef opt_dim, + size_t n_dims) { + if (opt_dim.has_value()) { + IntArrayRef dim = opt_dim.value(); + auto dim_size = dim.size(); + // Optimisation for two common cases + if (dim_size == 0) { + return t; + } else if (dim_size == 1) { + return t.unsqueeze(dim[0]); + } + } + auto dims_to_unsqueeze = at::dim_list_to_bitset(opt_dim, n_dims); + Tensor res = t; + for (const auto i : c10::irange(n_dims)) { + if (dims_to_unsqueeze[i]) { + res = res.unsqueeze(static_cast(i)); + } + } + return res; +} + +Tensor sum_backward( + const Tensor& grad, + c10::SymIntArrayRef sizes, + OptionalIntArrayRef opt_dims, + bool keepdim) { + if (!keepdim && !sizes.empty()) { + if (opt_dims.has_value() && !opt_dims.value().empty()) { + return unsqueeze_multiple(grad, opt_dims, sizes.size()) + .expand_symint(sizes); + } + } + return grad.expand_symint(sizes); +} + +Tensor mean_backward( + const Tensor& grad, + c10::SymIntArrayRef shape, + OptionalIntArrayRef opt_dim, + c10::SymInt numel, + bool keepdim) { + bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty(); + auto n = + is_all_reduce ? std::move(numel) : _safe_size(shape, opt_dim.value()); + return sum_backward(grad, shape, opt_dim, keepdim) / std::move(n); +} +} // namespace + Tensor XPUNativeFunctions::_adaptive_avg_pool2d_backward( - const Tensor& grad_output_, - const Tensor& input_) { + const Tensor& grad_output, + const Tensor& input) { + TensorArg grad_output_arg{grad_output, "grad_output", 1}, + input_arg{input, "input", 2}; + + native::adaptive_pool_empty_output_check( + grad_output, "adaptive_avg_pool2d_backward"); + + checkAllSameGPU(__func__, {grad_output_arg, input_arg}); + + TORCH_CHECK( + (input.ndimension() == 3 || input.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + + if (grad_output.size(-1) == 1 && grad_output.size(-2) == 1) { + return mean_backward( + grad_output, + input.sym_sizes().vec(), + {-1, -2}, + input.sym_numel(), + true); + } + + globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward"); + Tensor grad_input; - if (input_.numel() != 0) { - Tensor input, grad_output; - if (input_.ndimension() == 3) { - input = input_.contiguous(); - grad_output = grad_output_.contiguous(); - grad_input = at::empty_like(input); - } else { - auto smf = input_.suggest_memory_format(); - input = input_.contiguous(smf); - grad_output = grad_output_.contiguous(smf); - grad_input = at::empty_like(input_, smf); - } - native::xpu::adaptive_avg_pool2d_backward_out_kernel( + if (input.numel() != 0) { + native::xpu::adaptive_avg_pool2d_backward_kernel( grad_input, grad_output, input); } else { - grad_input = at::zeros_like(input_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } + return grad_input; } +Tensor& XPUNativeFunctions::adaptive_avg_pool2d_out( + const Tensor& input, + IntArrayRef output_size, + Tensor& output) { + TensorArg input_arg{input, "input", 1}, output_arg{output, "output", 2}; + checkAllSameGPU(__func__, {input_arg, output_arg}); + + TORCH_CHECK( + output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2"); 
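  // Note: after the shape checks below, a 1x1 output is handled as a mean over
  // the last two dimensions; any other output size launches the SYCL
  // adaptive_avg_pool2d kernel.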
+ int64_t ndim = input.dim(); + TORCH_CHECK( + (ndim == 3 || ndim == 4), + "adaptive_avg_pool2d(): Expected 3D or 4D tensor, but got ", + input.sizes()); + for (const auto i : {-2, -1}) { + TORCH_CHECK( + input.size(i) > 0, + "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " + "but input has sizes ", + input.sizes(), + " with dimension ", + i + ndim, + " being " + "empty"); + } + + if (output_size[0] == 1 && output_size[1] == 1) { + if (output.numel() == 0) { + output = input.mean({-1, -2}, /* keepdim = */ true); + } else { + at::mean_out(output, input, {-1, -2}, true, std::nullopt); + } + if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) { + // assert ndim == 4, since ndim = 3 doesn't give channels_last + const auto n = input.sym_size(0); + const auto c = input.sym_size(1); + output.as_strided__symint({n, c, 1, 1}, {c, 1, c, c}); + } + } else { + native::xpu::adaptive_avg_pool2d_kernel(output, input, output_size); + } + return output; +} + +Tensor XPUNativeFunctions::_adaptive_avg_pool2d( + at::Tensor const& input, + IntArrayRef output_size) { + auto output = at::empty({0}, input.options()); + adaptive_avg_pool2d_out(input, output_size, output); + return output; +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 40f32b5a5..1e7318f4f 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -157,7 +157,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { */ TORCH_LIBRARY_IMPL(aten, XPU, m) { std::vector fallback_list = { - "_adaptive_avg_pool2d", "_adaptive_avg_pool3d", "_adaptive_avg_pool3d_backward", "adaptive_max_pool2d_backward.grad_input", diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp deleted file mode 100644 index 108213065..000000000 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.cpp +++ /dev/null @@ -1,322 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -namespace at::native::xpu { - -using namespace at::xpu; - -template -struct AdaptiveAvgPool2dBwdKernelFunctor { - void operator()(sycl::nd_item<1> item) const { - int64_t gi = item.get_global_linear_id(); - - for (int64_t i = gi; i < numel; i += global_range) { - int64_t _iw, _ih, _ic, _ib; - if constexpr (is_channels_last) { - _ic = i % ic; - _iw = i / ic % iw; - _ih = i / ic / iw % ih; - _ib = i / ic / iw / ih; - } else { - _iw = i % iw; - _ih = i / iw % ih; - _ic = i / iw / ih % ic; - _ib = i / iw / ih / ic; - } - - int64_t _oh0 = native::start_index(_ih, ih, oh); - int64_t _oh1 = native::end_index(_ih, ih, oh); - int64_t _ow0 = native::start_index(_iw, iw, ow); - int64_t _ow1 = native::end_index(_iw, iw, ow); - int64_t _ob = _ib; - int64_t _oc = _ic; - - accscalar_t gx = 0; - accscalar_t _ikh, _ikw; - for (int _oh = _oh0; _oh < _oh1; _oh++) { - _ikh = accscalar_t(1.0) / - (accscalar_t)(native::end_index(_oh, oh, ih) - native::start_index(_oh, oh, ih)); - for (int _ow = _ow0; _ow < _ow1; _ow++) { - _ikw = accscalar_t(1.0) / - (accscalar_t)(native::end_index(_ow, ow, iw) - native::start_index(_ow, ow, iw)); - gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; - } - } - - const auto store = [](PackedTensorAccessor64 gxacc, - int64_t _ib, - int64_t _ic, - int64_t _ih, - int64_t _iw, - scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; - store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); - } - } - - 
AdaptiveAvgPool2dBwdKernelFunctor( - PackedTensorAccessor64 gyacc_, - PackedTensorAccessor64 gxacc_) - : gyacc(gyacc_), gxacc(gxacc_) { - ib = gxacc.size(0); - ic = gxacc.size(1); - ih = gxacc.size(2); - iw = gxacc.size(3); - oh = gyacc.size(2); - ow = gyacc.size(3); - - numel = ib * ic * ih * iw; - int total_item = std::min(numel, syclMaxWorkItemsPerTile()); - local_range = syclMaxWorkItemsPerEU(); - global_range = total_item < local_range - ? local_range - : (total_item / local_range) * local_range; - } - - sycl::range<1> glb_range() { - return sycl::range<1>(global_range); - } - - sycl::range<1> loc_range() { - return sycl::range<1>(local_range); - } - - private: - int ib; - int ic; - int ih; - int iw; - int oh; - int ow; - int64_t numel; - int global_range; - int local_range; - PackedTensorAccessor64 gyacc; - PackedTensorAccessor64 gxacc; -}; - -template -struct AdaptiveAvgPool2dBwdSLMKernelFunctor - : public __SYCL_KER_CONFIG_CONVENTION__ { - void operator()(sycl::nd_item<1> item) const { - int64_t gi = item.get_global_linear_id(); - int64_t li = item.get_local_id(0); - - // for-loop order: oh*ow->ih->iw - // reuse oh*ow(oh0, oh1, ow0, ow1), ih(ikh), iw(ikw) in inner loop. - for (int _ih = li; _ih < ih; _ih += local_range) { - _oh0_cached[_ih] = (int)native::start_index(_ih, ih, oh); - _oh1_cached[_ih] = (int)native::end_index(_ih, ih, oh); - } - for (int _iw = li; _iw < iw; _iw += local_range) { - _ow0_cached[_iw] = (int)native::start_index(_iw, iw, ow); - _ow1_cached[_iw] = (int)native::end_index(_iw, iw, ow); - } - for (int _oh = li; _oh < oh; _oh += local_range) { - _ikh_cached[_oh] = accscalar_t(1.0) / - (accscalar_t)(native::end_index(_oh, oh, ih) - - native::start_index(_oh, oh, ih)); - } - for (int _ow = li; _ow < ow; _ow += local_range) { - _ikw_cached[_ow] = accscalar_t(1.0) / - (accscalar_t)(native::end_index(_ow, ow, iw) - - native::start_index(_ow, ow, iw)); - } - - item.barrier(sycl_local_fence); - - for (int64_t i = gi; i < numel; i += global_range) { - int64_t _iw, _ih, _ic, _ib; - if constexpr (is_channels_last) { - _ic = i % ic; - _iw = i / ic % iw; - _ih = i / ic / iw % ih; - _ib = i / ic / iw / ih; - } else { - _iw = i % iw; - _ih = i / iw % ih; - _ic = i / iw / ih % ic; - _ib = i / iw / ih / ic; - } - - int64_t _oh0, _oh1, _ow0, _ow1; - _oh0 = _oh0_cached[_ih]; - _oh1 = _oh1_cached[_ih]; - _ow0 = _ow0_cached[_iw]; - _ow1 = _ow1_cached[_iw]; - int64_t _ob = _ib; - int64_t _oc = _ic; - - accscalar_t gx = 0; - accscalar_t _ikh, _ikw; - for (int _oh = _oh0; _oh < _oh1; _oh++) { - _ikh = _ikh_cached[_oh]; - for (int _ow = _ow0; _ow < _ow1; _ow++) { - _ikw = _ikw_cached[_ow]; - gx += gyacc[_ob][_oc][_oh][_ow] * _ikh * _ikw; - } - } - - const auto store = [](PackedTensorAccessor64 gxacc, - int64_t _ib, - int64_t _ic, - int64_t _ih, - int64_t _iw, - scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; - store(gxacc, _ib, _ic, _ih, _iw, (scalar_t)gx); - } - } - - void sycl_ker_config_convention(sycl::handler& cgh) { - _oh0_cached = sycl_local_acc_t(ih, cgh); - _oh1_cached = sycl_local_acc_t(ih, cgh); - _ow0_cached = sycl_local_acc_t(iw, cgh); - _ow1_cached = sycl_local_acc_t(iw, cgh); - _ikh_cached = sycl_local_acc_t(oh, cgh); - _ikw_cached = sycl_local_acc_t(ow, cgh); - } - - AdaptiveAvgPool2dBwdSLMKernelFunctor( - PackedTensorAccessor64 gyacc_, - PackedTensorAccessor64 gxacc_) - : gyacc(gyacc_), gxacc(gxacc_) { - ib = gxacc.size(0); - ic = gxacc.size(1); - ih = gxacc.size(2); - iw = gxacc.size(3); - oh = gyacc.size(2); - ow = gyacc.size(3); - - numel = ib * ic * ih * iw; 
- int total_item = std::min(numel, syclMaxWorkItemsPerTile()); - - local_range = syclMaxWorkGroupSize(); - global_range = total_item < local_range - ? local_range - : (total_item / local_range) * local_range; - } - - sycl::range<1> glb_range() { - return sycl::range<1>(global_range); - } - - sycl::range<1> loc_range() { - return sycl::range<1>(local_range); - } - - private: - int ib; - int ic; - int ih; - int iw; - int oh; - int ow; - int64_t numel; - int local_range; - int global_range; - PackedTensorAccessor64 gyacc; - PackedTensorAccessor64 gxacc; - sycl_local_acc_t _oh0_cached; - sycl_local_acc_t _oh1_cached; - sycl_local_acc_t _ow0_cached; - sycl_local_acc_t _ow1_cached; - sycl_local_acc_t _ikh_cached; - sycl_local_acc_t _ikw_cached; -}; - -void adaptive_avg_pool2d_backward_out_kernel( - Tensor& gradInput, - const Tensor& gradOutput, - const Tensor& input) { - TensorArg grad_input_arg{gradInput, "gradInput", 1}, - grad_output_arg{gradOutput, "gradOutput", 2}, - input_arg{input, "input", 3}; - adaptive_pool_empty_output_check(gradOutput, "adaptive_avg_pool2d_backward"); - checkAllSameGPU(__func__, {grad_input_arg, grad_output_arg, input_arg}); - - TORCH_CHECK( - (input.ndimension() == 3 || input.ndimension() == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input"); - - auto outputHeight = gradOutput.size(-2); - auto outputWidth = gradOutput.size(-1); - - const auto nInputPlane = input.size(-3); - const auto inputHeight = input.size(-2); - const auto inputWidth = input.size(-1); - - int dH = std::floor((float)2 * inputHeight / outputHeight) - - (inputHeight / outputHeight); - int dW = std::floor((float)2 * inputWidth / outputWidth) - - (inputWidth / outputWidth); - std::vector stride_vec = {dH, dW}; - - int kH = std::ceil((float)2 * inputHeight / outputHeight) - - (inputHeight / outputHeight); - int kW = std::ceil((float)2 * inputWidth / outputWidth) - - (inputWidth / outputWidth); - std::vector kernel_size_vec = {kH, kW}; - - int padH = (dH * (outputHeight - 1) + kH - inputHeight) / 2; - int padW = (dW * (outputWidth - 1) + kW - inputWidth) / 2; - std::vector padding_vec = {padH, padW}; - - bool is_3d = gradOutput.ndimension() == 3; - if (is_3d) { - gradOutput.resize_({1, nInputPlane, outputHeight, outputWidth}); - gradInput.resize_({1, nInputPlane, inputHeight, inputWidth}); - } - - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::BFloat16, - at::ScalarType::Half, - gradOutput.scalar_type(), - "adaptive_avg_pool2d_backward_xpu", - [&]() { - using accscalar_t = acc_type; - auto gyacc = gradOutput.packed_accessor64(); - auto gxacc = gradInput.packed_accessor64(); - - int64_t ohw01_shared_size = - ((inputHeight + inputWidth) * 2) * sizeof(int); - int64_t ikhw_shared_size = - (outputHeight + outputWidth) * sizeof(accscalar_t); - bool using_shared = - syclLocalMemSize() >= ohw01_shared_size + ikhw_shared_size; - - auto& q = getCurrentSYCLQueue(); - if (is_smf_channels_last(gradOutput)) { - if (using_shared) { - AdaptiveAvgPool2dBwdSLMKernelFunctor - kfn(gyacc, gxacc); - sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); - } else { - AdaptiveAvgPool2dBwdKernelFunctor kfn( - gyacc, gxacc); - sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); - } - } else { - if (using_shared) { - AdaptiveAvgPool2dBwdSLMKernelFunctor - kfn(gyacc, gxacc); - sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); - } else { - AdaptiveAvgPool2dBwdKernelFunctor kfn( - gyacc, gxacc); - sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); - } - } - }); - - if 
(is_3d) { - gradOutput.resize_({nInputPlane, outputHeight, outputWidth}); - gradInput.resize_({nInputPlane, inputHeight, inputWidth}); - } -} - -} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.h deleted file mode 100644 index e56609add..000000000 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernel.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include - -namespace at::native::xpu { - -void adaptive_avg_pool2d_backward_out_kernel( - Tensor& gradInput, - const Tensor& gradOutput, - const Tensor& input); - -} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp new file mode 100644 index 000000000..aacc66062 --- /dev/null +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -0,0 +1,513 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +using namespace at::xpu; + +template +struct AdaptiveAvgPool2dBwdKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + + for (int64_t i = gi; i < numel_; i += global_range_) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic_; + _iw = i / ic_ % iw_; + _ih = i / ic_ / iw_ % ih_; + _ib = i / ic_ / iw_ / ih_; + } else { + _iw = i % iw_; + _ih = i / iw_ % ih_; + _ic = i / iw_ / ih_ % ic_; + _ib = i / iw_ / ih_ / ic_; + } + + int64_t _oh0 = native::start_index(_ih, ih_, oh_); + int64_t _oh1 = native::end_index(_ih, ih_, oh_); + int64_t _ow0 = native::start_index(_iw, iw_, ow_); + int64_t _ow1 = native::end_index(_iw, iw_, ow_); + int64_t _ob = _ib; + int64_t _oc = _ic; + + opmath_t gx = 0; + opmath_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = opmath_t(1.0) / + (opmath_t)(native::end_index(_oh, oh_, ih_) - native::start_index(_oh, oh_, ih_)); + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = opmath_t(1.0) / + (opmath_t)(native::end_index(_ow, ow_, iw_) - native::start_index(_ow, ow_, iw_)); + gx += gyacc_[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc_, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + AdaptiveAvgPool2dBwdKernelFunctor( + PackedTensorAccessor64 gyacc, + PackedTensorAccessor64 gxacc) + : gyacc_(gyacc), gxacc_(gxacc) { + ib_ = gxacc_.size(0); + ic_ = gxacc_.size(1); + ih_ = gxacc_.size(2); + iw_ = gxacc_.size(3); + oh_ = gyacc_.size(2); + ow_ = gyacc_.size(3); + + numel_ = ib_ * ic_ * ih_ * iw_; + int total_item = std::min(numel_, syclMaxWorkItemsPerTile()); + local_range_ = syclMaxWorkItemsPerEU(); + global_range_ = total_item < local_range_ + ? 
local_range_ + : (total_item / local_range_) * local_range_; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range_); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range_); + } + + private: + int ib_; + int ic_; + int ih_; + int iw_; + int oh_; + int ow_; + int64_t numel_; + int global_range_; + int local_range_; + PackedTensorAccessor64 gyacc_; + PackedTensorAccessor64 gxacc_; +}; + +template +struct AdaptiveAvgPool2dBwdSLMKernelFunctor + : public __SYCL_KER_CONFIG_CONVENTION__ { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + int64_t li = item.get_local_id(0); + + // for-loop order: oh*ow->ih->iw + // reuse oh*ow(oh0, oh1, ow0, ow1), ih(ikh), iw(ikw) in inner loop. + for (int _ih = li; _ih < ih_; _ih += local_range_) { + _oh0_cached_[_ih] = (int)native::start_index(_ih, ih_, oh_); + _oh1_cached_[_ih] = (int)native::end_index(_ih, ih_, oh_); + } + for (int _iw = li; _iw < iw_; _iw += local_range_) { + _ow0_cached_[_iw] = (int)native::start_index(_iw, iw_, ow_); + _ow1_cached_[_iw] = (int)native::end_index(_iw, iw_, ow_); + } + for (int _oh = li; _oh < oh_; _oh += local_range_) { + _ikh_cached_[_oh] = opmath_t(1.0) / + (opmath_t)(native::end_index(_oh, oh_, ih_) - + native::start_index(_oh, oh_, ih_)); + } + for (int _ow = li; _ow < ow_; _ow += local_range_) { + _ikw_cached_[_ow] = opmath_t(1.0) / + (opmath_t)(native::end_index(_ow, ow_, iw_) - + native::start_index(_ow, ow_, iw_)); + } + + item.barrier(sycl_local_fence); + + for (int64_t i = gi; i < numel_; i += global_range_) { + int64_t _iw, _ih, _ic, _ib; + if constexpr (is_channels_last) { + _ic = i % ic_; + _iw = i / ic_ % iw_; + _ih = i / ic_ / iw_ % ih_; + _ib = i / ic_ / iw_ / ih_; + } else { + _iw = i % iw_; + _ih = i / iw_ % ih_; + _ic = i / iw_ / ih_ % ic_; + _ib = i / iw_ / ih_ / ic_; + } + + int64_t _oh0, _oh1, _ow0, _ow1; + _oh0 = _oh0_cached_[_ih]; + _oh1 = _oh1_cached_[_ih]; + _ow0 = _ow0_cached_[_iw]; + _ow1 = _ow1_cached_[_iw]; + int64_t _ob = _ib; + int64_t _oc = _ic; + + opmath_t gx = 0; + opmath_t _ikh, _ikw; + for (int _oh = _oh0; _oh < _oh1; _oh++) { + _ikh = _ikh_cached_[_oh]; + for (int _ow = _ow0; _ow < _ow1; _ow++) { + _ikw = _ikw_cached_[_ow]; + gx += gyacc_[_ob][_oc][_oh][_ow] * _ikh * _ikw; + } + } + + const auto store = [](PackedTensorAccessor64 gxacc, + int64_t _ib, + int64_t _ic, + int64_t _ih, + int64_t _iw, + scalar_t res) { gxacc[_ib][_ic][_ih][_iw] = res; }; + store(gxacc_, _ib, _ic, _ih, _iw, (scalar_t)gx); + } + } + + void sycl_ker_config_convention(sycl::handler& cgh) { + _oh0_cached_ = sycl_local_acc_t(ih_, cgh); + _oh1_cached_ = sycl_local_acc_t(ih_, cgh); + _ow0_cached_ = sycl_local_acc_t(iw_, cgh); + _ow1_cached_ = sycl_local_acc_t(iw_, cgh); + _ikh_cached_ = sycl_local_acc_t(oh_, cgh); + _ikw_cached_ = sycl_local_acc_t(ow_, cgh); + } + + AdaptiveAvgPool2dBwdSLMKernelFunctor( + PackedTensorAccessor64 gyacc, + PackedTensorAccessor64 gxacc) + : gyacc_(gyacc), gxacc_(gxacc) { + ib_ = gxacc_.size(0); + ic_ = gxacc_.size(1); + ih_ = gxacc_.size(2); + iw_ = gxacc_.size(3); + oh_ = gyacc_.size(2); + ow_ = gyacc_.size(3); + + numel_ = ib_ * ic_ * ih_ * iw_; + int total_item = std::min(numel_, syclMaxWorkItemsPerTile()); + + local_range_ = syclMaxWorkGroupSize(); + global_range_ = total_item < local_range_ + ? 
local_range_ + : (total_item / local_range_) * local_range_; + } + + sycl::range<1> glb_range() { + return sycl::range<1>(global_range_); + } + + sycl::range<1> loc_range() { + return sycl::range<1>(local_range_); + } + + private: + int ib_; + int ic_; + int ih_; + int iw_; + int oh_; + int ow_; + int64_t numel_; + int local_range_; + int global_range_; + PackedTensorAccessor64 gyacc_; + PackedTensorAccessor64 gxacc_; + sycl_local_acc_t _oh0_cached_; + sycl_local_acc_t _oh1_cached_; + sycl_local_acc_t _ow0_cached_; + sycl_local_acc_t _ow1_cached_; + sycl_local_acc_t _ikh_cached_; + sycl_local_acc_t _ikw_cached_; +}; + +void adaptive_avg_pool2d_backward_kernel( + Tensor& grad_input, + const Tensor& grad_output_, + const Tensor& input_) { + Tensor input, grad_output; + if (input_.ndimension() == 3) { + input = input_.contiguous(); + grad_output = grad_output_.contiguous(); + grad_input = at::empty_like(input); + } else { + auto smf = input_.suggest_memory_format(); + input = input_.contiguous(smf); + grad_output = grad_output_.contiguous(smf); + grad_input = at::empty_like(input_, smf); + } + + auto outputHeight = grad_output.size(-2); + auto outputWidth = grad_output.size(-1); + + const auto nInputPlane = input.size(-3); + const auto inputHeight = input.size(-2); + const auto inputWidth = input.size(-1); + + int dH = std::floor((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int dW = std::floor((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector stride_vec = {dH, dW}; + + int kH = std::ceil((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int kW = std::ceil((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector kernel_size_vec = {kH, kW}; + + int padH = (dH * (outputHeight - 1) + kH - inputHeight) / 2; + int padW = (dW * (outputWidth - 1) + kW - inputWidth) / 2; + std::vector padding_vec = {padH, padW}; + + bool is_3d = grad_output.ndimension() == 3; + if (is_3d) { + grad_output.resize_({1, nInputPlane, outputHeight, outputWidth}); + grad_input.resize_({1, nInputPlane, inputHeight, inputWidth}); + } + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, + at::ScalarType::Half, + grad_output.scalar_type(), + "adaptive_avg_pool2d_backward_xpu", + [&]() { + using opmath_t = at::opmath_type; + auto gyacc = grad_output.packed_accessor64(); + auto gxacc = grad_input.packed_accessor64(); + + int64_t ohw01_shared_size = + ((inputHeight + inputWidth) * 2) * sizeof(int); + int64_t ikhw_shared_size = + (outputHeight + outputWidth) * sizeof(opmath_t); + bool using_shared = + syclLocalMemSize() >= ohw01_shared_size + ikhw_shared_size; + + auto& q = getCurrentSYCLQueue(); + if (is_smf_channels_last(grad_output)) { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } else { + if (using_shared) { + AdaptiveAvgPool2dBwdSLMKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } else { + AdaptiveAvgPool2dBwdKernelFunctor kfn( + gyacc, gxacc); + sycl_kernel_submit(kfn.glb_range(), kfn.loc_range(), q, kfn); + } + } + }); + + if (is_3d) { + grad_output.resize_({nInputPlane, outputHeight, outputWidth}); + grad_input.resize_({nInputPlane, inputHeight, inputWidth}); + } +} + +template +struct 
AdaptiveAvgPool2dKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + int64_t gi = item.get_global_linear_id(); + for (int64_t i = gi; i < numel_; i += global_range_) { + int64_t _ow, _oh, _oc, _ob; + if constexpr (is_channels_last) { + _oc = i % oc_; + _ow = i / oc_ % ow_; + _oh = i / oc_ / ow_ % oh_; + _ob = i / oc_ / ow_ / oh_; + } else { + _ow = i % ow_; + _oh = i / ow_ % oh_; + _oc = i / ow_ / oh_ % oc_; + _ob = i / ow_ / oh_ / oc_; + } + + int64_t _ih0 = native::start_index(_oh, oh_, ih_); + int64_t _ih1 = native::end_index(_oh, oh_, ih_); + int64_t _iw0 = native::start_index(_ow, ow_, iw_); + int64_t _iw1 = native::end_index(_ow, ow_, iw_); + int64_t kh = _ih1 - _ih0; + int64_t kw = _iw1 - _iw0; + int64_t _ib = _ob; + int64_t _ic = _oc; + + opmath_t sum = static_cast(0); + for (int _ih = _ih0; _ih < _ih1; _ih++) { + for (int _iw = _iw0; _iw < _iw1; _iw++) { + sum += opmath_t(input_[_ib][_ic][_ih][_iw]); + } + } + opmath_t avg = sum / kh / kw; + + const auto store = [](PackedTensorAccessor64 oacc, + int64_t _ob, + int64_t _oc, + int64_t _oh, + int64_t _ow, + scalar_t res) { oacc[_ob][_oc][_oh][_ow] = res; }; + store(output_, _ob, _oc, _oh, _ow, avg); + } + } + AdaptiveAvgPool2dKernelFunctor( + int ih, + int iw, + int ob, + int oc, + int oh, + int ow, + int64_t numel, + int global_range, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output) + : ih_(ih), + iw_(iw), + ob_(ob), + oc_(oc), + oh_(oh), + ow_(ow), + numel_(numel), + global_range_(global_range), + input_(input), + output_(output) {} + + private: + int ih_; + int iw_; + int ob_; + int oc_; + int oh_; + int ow_; + int64_t numel_; + int global_range_; + PackedTensorAccessor64 input_; + PackedTensorAccessor64 output_; +}; + +template +void launch_adaptive_avg_pool2d_kernel( + PackedTensorAccessor64 input, + PackedTensorAccessor64 output) { + int ih = input.size(2); + int iw = input.size(3); + int ob = output.size(0); + int oc = output.size(1); + int oh = output.size(2); + int ow = output.size(3); + + int64_t numel = ob * oc * oh * ow; + int total_item = std::min(numel, syclMaxWorkItemsPerTile()); + int local_range = syclMaxWorkItemsPerEU(); + int global_range = total_item < local_range + ? local_range + : ((total_item + local_range - 1) / local_range) * local_range; + auto caller = + AdaptiveAvgPool2dKernelFunctor( + ih, iw, ob, oc, oh, ow, numel, global_range, input, output); + sycl_kernel_submit( + sycl::range<1>(global_range), + sycl::range<1>(local_range), + getCurrentSYCLQueue(), + caller); +} + +void adaptive_avg_pool2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + auto outputWidth = output_size[1]; + auto outputHeight = output_size[0]; + + if (!input.is_quantized() && outputWidth == 1 && outputHeight == 1) { + // in this case, adaptive pooling is just computing mean over hw + // dimensions, which can be done more efficiently + + output = input.mean({-1, -2}, /* keepdim = */ true); + if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) { + // assert ndim == 4, since ndim = 3 doesn't give channels_last + const int n = input.size(0); + const int c = input.size(1); + output.as_strided_({n, c, 1, 1}, {c, 1, c, c}); + } + return; + } + + /* sizes */ + const int64_t nbatch = input.ndimension() == 4 ? 
input.size(-4) : 1; + const auto nInputPlane = input.size(-3); + const auto inputHeight = input.size(-2); + const auto inputWidth = input.size(-1); + Tensor input_; + if (input.ndimension() == 3) { + input_ = input.contiguous(); + output.resize_({nInputPlane, outputHeight, outputWidth}); + } else { + auto smf = input.suggest_memory_format(); + input_ = input.contiguous(smf); + output.resize_({nbatch, nInputPlane, outputHeight, outputWidth}, smf); + } + if (output.numel() == 0) { + return; + } + int dH = std::floor((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int dW = std::floor((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector stride_vec = {dH, dW}; + + int kH = std::ceil((float)2 * inputHeight / outputHeight) - + (inputHeight / outputHeight); + int kW = std::ceil((float)2 * inputWidth / outputWidth) - + (inputWidth / outputWidth); + std::vector kernel_size_vec = {kH, kW}; + + int padH = (dH * (outputHeight - 1) + kH - inputHeight) / 2; + int padW = (dW * (outputWidth - 1) + kW - inputWidth) / 2; + std::vector padding_vec = {padH, padW}; + + bool is_3d = input_.ndimension() == 3; + if (is_3d) { + input_.resize_({1, nInputPlane, inputHeight, inputWidth}); + output.resize_({1, nInputPlane, outputHeight, outputWidth}); + } + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, + at::ScalarType::Half, + input_.scalar_type(), + "adaptive_avg_pool2d_xpu", + [&]() { + using opmath_t = at::opmath_type; + auto iacc = input_.packed_accessor64(); + auto oacc = output.packed_accessor64(); + if (is_smf_channels_last(output)) { + launch_adaptive_avg_pool2d_kernel( + iacc, oacc); + } else { + launch_adaptive_avg_pool2d_kernel( + iacc, oacc); + } + }); + + if (is_3d) { + input_.resize_({nInputPlane, inputHeight, inputWidth}); + output.resize_({nInputPlane, outputHeight, outputWidth}); + } +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h new file mode 100644 index 000000000..9b6d9a046 --- /dev/null +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void adaptive_avg_pool2d_backward_kernel( + Tensor& gradInput, + const Tensor& gradOutput, + const Tensor& input); + +void adaptive_avg_pool2d_kernel( + Tensor& output, + const Tensor& input, + IntArrayRef output_size); + +} // namespace at::native::xpu diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 751385d7a..c66f7bf6c 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -10,6 +10,8 @@ supported: - add_.Scalar - add.Scalar_out - _adaptive_avg_pool2d_backward + - adaptive_avg_pool2d.out + - _adaptive_avg_pool2d - cumsum - cumsum.out - cumsum_ From 662ab6a36a13b4b357ca2ccbf888ffaac4da7c9f Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Thu, 4 Jul 2024 08:56:33 +0800 Subject: [PATCH 04/20] Add aten::_efficientzerotensor and aten::complex (#441) Signed-off-by: Feng Yuan --- src/ATen/native/xpu/TensorFactories.cpp | 69 +++++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 2 - src/ATen/native/xpu/sycl/ComplexKernels.cpp | 23 +++++++ src/ATen/native/xpu/sycl/ComplexKernels.h | 9 +++ yaml/xpu_functions.yaml | 2 + 5 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 src/ATen/native/xpu/sycl/ComplexKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/ComplexKernels.h diff --git a/src/ATen/native/xpu/TensorFactories.cpp 
b/src/ATen/native/xpu/TensorFactories.cpp index 8b4f1b8d0..d7b79902f 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -11,6 +12,7 @@ #include #endif +#include #include #include @@ -63,6 +65,73 @@ Tensor XPUNativeFunctions::clone( return at::native::clone(self, memory_format); } +Tensor XPUNativeFunctions::_efficientzerotensor( + IntArrayRef size, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + auto device_ = device_or_default(device); + if (!device_.has_index()) { + device_.set_index(c10::xpu::current_device()); + } + auto allocator = at::native::ZeroTensorAllocator(device_); + auto dtype_ = dtype_or_default(dtype); + auto zero_ks = at::DispatchKeySet(c10::DispatchKey::XPU) | + at::DispatchKeySet(c10::DispatchKey::ZeroTensor); + auto out = at::detail::empty_generic( + size, &allocator, zero_ks, dtype_, c10::nullopt); + return out; +} + +static void complex_check_floating(const Tensor& a, const Tensor& b) { + TORCH_CHECK( + (a.scalar_type() == kFloat || a.scalar_type() == kDouble || + a.scalar_type() == kHalf) && + (b.scalar_type() == kFloat || b.scalar_type() == kDouble || + b.scalar_type() == kHalf), + "Expected both inputs to be Half, Float or Double tensors but got ", + a.scalar_type(), + " and ", + b.scalar_type()); +} + +static void complex_check_dtype( + const Tensor& result, + const Tensor& a, + const Tensor& b) { + complex_check_floating(a, b); + TORCH_CHECK( + a.scalar_type() == b.scalar_type(), + "Expected object of scalar type ", + a.scalar_type(), + " but got scalar type ", + b.scalar_type(), + " for second argument"); + TORCH_CHECK( + result.scalar_type() == toComplexType(a.scalar_type()), + "Expected object of scalar type ", + toComplexType(a.scalar_type()), + " but got scalar type ", + result.scalar_type(), + " for argument 'out'"); +} + +Tensor& XPUNativeFunctions::complex_out( + const Tensor& real, + const Tensor& imag, + Tensor& result) { + complex_check_dtype(result, real, imag); + auto iter = TensorIteratorConfig() + .add_output(result) + .add_const_input(real) + .add_const_input(imag) + .check_all_same_dtype(false) + .build(); + native::xpu::complex_kernel(iter); + return result; +} + Tensor& XPUNativeFunctions::randperm_out( int64_t n, c10::optional generator, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 1e7318f4f..a9a3020a8 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -186,7 +186,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "cholesky", "cholesky_inverse", "_cholesky_solve_helper", - "complex.out", "conj_physical.out", "copysign.out", "cosh.out", @@ -199,7 +198,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "digamma.out", "dot", "_efficient_attention_forward", - "_efficientzerotensor", "_embedding_bag_dense_backward", "_embedding_bag_per_sample_weights_backward", "equal", diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp new file mode 100644 index 000000000..686f5e2d3 --- /dev/null +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -0,0 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#include + +namespace at::native::xpu { + +template +struct ComplexFunctor { + c10::complex operator()(scalar_t a, scalar_t b) const { + return c10::complex(a, b); + } +}; 
+ +void complex_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_xpu", [&]() { + ComplexFunctor f; + gpu_kernel(iter, f); + }); +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.h b/src/ATen/native/xpu/sycl/ComplexKernels.h new file mode 100644 index 000000000..990bcd14e --- /dev/null +++ b/src/ATen/native/xpu/sycl/ComplexKernels.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace at::native::xpu { + +void complex_kernel(TensorIterator& iter); + +} // namespace at::native::xpu diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index c66f7bf6c..6000704ac 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -190,6 +190,8 @@ supported: - exp_ - empty.memory_format - empty_strided + - _efficientzerotensor + - complex.out - clone - fill_.Scalar - fill_.Tensor From 41ac6a36a48ed36252ebcffad6d46c3d54fe443f Mon Sep 17 00:00:00 2001 From: Yutao Xu Date: Thu, 4 Jul 2024 09:00:14 +0800 Subject: [PATCH 05/20] Add aten::trace (#446) Add aten::trace Co-authored-by: Feng Yuan --- src/ATen/native/xpu/TriangluarOps.cpp | 6 ++++++ src/ATen/native/xpu/XPUFallback.template | 1 - test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/TriangluarOps.cpp b/src/ATen/native/xpu/TriangluarOps.cpp index 6b5428e6c..affba5665 100644 --- a/src/ATen/native/xpu/TriangluarOps.cpp +++ b/src/ATen/native/xpu/TriangluarOps.cpp @@ -68,4 +68,10 @@ Tensor& XPUNativeFunctions::triu_(Tensor& self, int64_t diagonal) { xpu::check_inplace(self, self.sizes(), self.options()); return triu_out(self, diagonal, self); } + +Tensor XPUNativeFunctions::trace(const Tensor& self) { + TORCH_CHECK(self.dim() == 2, "expected a matrix"); + return self.diagonal().sum(); +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index a9a3020a8..a9cfa7206 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -370,7 +370,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "topk.values", "_to_sparse", "_to_sparse_csr", - "trace", "triangular_solve.X", "tril_indices", "triu_indices", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 61a2c6b0d..fbb2dc73c 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -27,6 +27,7 @@ "view_as_real", "view_as_complex", "view", + "trace", "resize_", "resize_as_", "add", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 6000704ac..f4ce0960f 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -453,6 +453,7 @@ supported: - _cdist_forward - _pin_memory - is_pinned + - trace - reflection_pad2d - reflection_pad2d.out - reflection_pad2d_backward From 3316f9f9a2179cc3fc797ba487e236c60af0b81b Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Thu, 4 Jul 2024 14:59:57 +0800 Subject: [PATCH 06/20] Enhance E2E test for multiple cards shard (#524) --- .github/actions/inductor-xpu-e2e-test/action.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 5647da77e..6e1dd4268 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -110,14 +110,11 @@ runs: contains "accuracy,performance" $scenario $contains_status if [ "${MODEL_ONLY_NAME}" == "" ];then - 
bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 8 0 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 1 static 8 1 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 2 static 8 2 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 3 static 8 3 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 4 static 8 4 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 5 static 8 5 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 6 static 8 6 &
-            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 7 static 8 7 &
+            xpu_list=($(xpu-smi discovery |grep 'DRM Device: /dev/' |sed 's/.*card//;s/[^0-9].*//' |awk '{print $1 - 1":"NR - 1}'))
+            for xpu_id in ${xpu_list[*]}
+            do
+              bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu ${xpu_id/:*} static ${#xpu_list[*]} ${xpu_id/*:} &
+            done
          else
            bash inductor_xpu_test.sh ${suite} ${dt} ${mode} ${scenario} xpu 0 static 1 0 ${MODEL_ONLY_NAME} &
          fi

From c5cad200260700d21015cf2fb4ba070a203da3ad Mon Sep 17 00:00:00 2001
From: chunhuanMeng <105194461+chunhuanMeng@users.noreply.github.com>
Date: Thu, 4 Jul 2024 18:53:09 +0800
Subject: [PATCH 07/20] Add skip case (#538)

The operator aten::_embedding_bag_backward is not implemented for XPU yet, so
skip "test_backward_nn_functional_embedding_bag_xpu_float32" for now and
re-enable it once the operator is supported.
https://github.com/intel/torch-xpu-ops/issues/536
---
 test/xpu/extended/run_test_with_skip.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index 560e23830..c7c2ff404 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -82,6 +82,11 @@
     "test_compare_cpu_nn_functional_embedding_bag_xpu_float64",
     "test_view_replay_nn_functional_embedding_bag_xpu_float32",

+    # Not implemented operators, aten::_embedding_bag_backward.
+    # To retrieve cases when the operators are supported.
+ # https://github.com/intel/torch-xpu-ops/issues/536 + "test_backward_nn_functional_embedding_bag_xpu_float32", + #Double and complex datatype matmul is not supported in oneDNN "test_compare_cpu_cdist_xpu_float64", From d286fd8cd62060bb9e5807de73c45363361b62f0 Mon Sep 17 00:00:00 2001 From: "Huaiyu, Zheng" Date: Fri, 5 Jul 2024 08:48:54 +0800 Subject: [PATCH 08/20] Add aten::bucketize and aten::searchsorted (#447) add bucketize searchsorted --------- Co-authored-by: chunhuanMeng Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Bucketization.cpp | 125 +++++++++ src/ATen/native/xpu/LinearAlgebra.cpp | 8 +- src/ATen/native/xpu/TensorFactories.cpp | 2 +- src/ATen/native/xpu/XPUFallback.template | 2 - .../native/xpu/sycl/BucketizationKernels.cpp | 246 ++++++++++++++++++ .../native/xpu/sycl/BucketizationKernels.h | 12 + src/ATen/native/xpu/sycl/ComplexKernels.cpp | 9 +- .../native/xpu/sycl/LinearAlgebraKernels.cpp | 3 - .../native/xpu/sycl/LinearAlgebraKernels.h | 1 - test/xpu/xpu_test_utils.py | 2 + yaml/xpu_functions.yaml | 7 + 11 files changed, 399 insertions(+), 18 deletions(-) create mode 100644 src/ATen/native/xpu/Bucketization.cpp create mode 100644 src/ATen/native/xpu/sycl/BucketizationKernels.cpp create mode 100644 src/ATen/native/xpu/sycl/BucketizationKernels.h diff --git a/src/ATen/native/xpu/Bucketization.cpp b/src/ATen/native/xpu/Bucketization.cpp new file mode 100644 index 000000000..0d6c2a9f5 --- /dev/null +++ b/src/ATen/native/xpu/Bucketization.cpp @@ -0,0 +1,125 @@ +#include +#include +#include +#include + +namespace at { + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned sorter_maybe_owned = + at::borrow_from_optional_tensor(sorter_opt); + const Tensor& sorter = *sorter_maybe_owned; + at::native::searchsorted_pre_check( + sorted_sequence, self, result, out_int32, right, side_opt, sorter); + at::native::resize_output(result, self.sizes()); + + if (self.numel() == 0) { + return result; + } + + // we have two inputs to set right, pre_check checks that they aren't set to + // opposites + bool is_right = (side_opt && *side_opt == "right") || right; + at::native::xpu::searchsorted_kernel( + result, self, sorted_sequence, out_int32, is_right, sorter); + return result; +} + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_out( + sorted_sequence, + scalar_tensor, + out_int32, + right, + side_opt, + sorter_opt, + result); +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + searchsorted_out( + sorted_sequence, self, out_int32, right, side_opt, sorter, result); + return result; +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted( + sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); +} + +Tensor& XPUNativeFunctions::bucketize_out( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right, + Tensor& result) { + TORCH_CHECK( + boundaries.dim() == 1, + "boundaries tensor must be 1 dimension, but got dim(", + boundaries.dim(), + ")"); + searchsorted_out( + boundaries, self, out_int32, right, nullopt, nullopt, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + bucketize_out(self, boundaries, out_int32, right, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Scalar& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + return bucketize( + at::native::searchsorted_scalar_tensor(self, boundaries.device()), + boundaries, + out_int32, + right); +} +} // namespace at diff --git a/src/ATen/native/xpu/LinearAlgebra.cpp b/src/ATen/native/xpu/LinearAlgebra.cpp index dfc7bd70b..2f857f18b 100644 --- a/src/ATen/native/xpu/LinearAlgebra.cpp +++ b/src/ATen/native/xpu/LinearAlgebra.cpp @@ -1,15 +1,9 @@ #include -#include -#include -#include -#include #include #include -#include -#include - #include #include +#include #include namespace at { diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index d7b79902f..ee29aa167 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,8 +1,8 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index a9cfa7206..f8b3db5de 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -178,7 +178,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "binary_cross_entropy_backward", "bitwise_left_shift.Tensor_out", "bitwise_right_shift.Tensor_out", - "bucketize.Tensor", "cauchy_", "_cdist_backward", "ceil.out", @@ -322,7 +321,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "__rshift__.Scalar", "_scaled_dot_product_efficient_attention", "_scaled_mm", - "searchsorted.Tensor", "segment_reduce", "_segment_reduce_backward", "signbit.out", diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp new file mode 100644 index 000000000..213283de0 --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -0,0 +1,246 @@ +#include +#include +#include + +namespace at::native::xpu { + +// customized 
lower_bound func to ensure the low bound of 'nan', 'inf' etc. be +// the end of boundary and we can properly handle a sorter argument +// std::lower_bound can not be used here since its customized comparator need +// strict weak ordering and the customized comparators require both arguments to +// have the same type, which wouldn't happen when comparing val of input_t to an +// indexer value from sorter of int64 +template +int64_t cus_lower_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val >= val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +// customized upper_bound func to ensure we can properly handle a sorter +// argument std::upper_bound can not be used here since its customized +// comparator requires both arguments to have the same type, which wouldn't +// happen when comparing val of input_t to an indexer value from sorter of int64 +template +int64_t cus_upper_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val > val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +template +struct SearchsortedKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + for (int64_t i = item.get_global_id(0); i < numel_in_; + i += item.get_global_range()[0]) { + // If boundaries tensor is 1d, we always search the entire boundary + // tensor + int64_t start_bd = is_1d_boundaries_ ? 0 : i / idim_in_ * idim_bd_; + int64_t end_bd = start_bd + idim_bd_; + + int64_t pos = !right_ + ? 
cus_lower_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd + : cus_upper_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd; + + // type conversion might happen here + data_out_data_[i] = pos; + } + } + + SearchsortedKernelFunctor( + const bool right, + int64_t numel_in, + int64_t idim_in, + int64_t idim_bd, + const int64_t* data_st, + output_t* data_out, + bool is_1d_boundaries, + input_t* data_in_data, + input_t* data_bd_data, + output_t* data_out_data) + : right_(right), + numel_in_(numel_in), + idim_in_(idim_in), + idim_bd_(idim_bd), + data_st_(data_st), + data_out_(data_out), + is_1d_boundaries_(is_1d_boundaries), + data_in_data_(data_in_data), + data_bd_data_(data_bd_data), + data_out_data_(data_out_data) {} + + private: + const bool right_; + int64_t numel_in_; + int64_t idim_in_; + int64_t idim_bd_; + const int64_t* data_st_; + output_t* data_out_; + bool is_1d_boundaries_; + input_t* data_in_data_; + input_t* data_bd_data_; + output_t* data_out_data_; +}; +template +void searchsorted_template( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + const bool& right, + const Tensor& sorter) { + int64_t numel_in = input.numel(); + int64_t rng, grng, tile_size; + tile_size = syclMaxWorkGroupSize(); + rng = numel_in; + if (rng == 0) { + rng = static_cast(1); + } + + grng = rng; + if (tile_size > grng) { + tile_size = grng; + } else if (grng > tile_size) { + int64_t xMode = static_cast(grng % tile_size); + if (xMode != 0) { + grng += static_cast(tile_size - xMode); + } + } + + bool is_scalar_input = input.dim() == 0 && numel_in == 1; + // inner most dim size of input and boundaries + int64_t idim_in = is_scalar_input ? 1 : input.sizes().back(); + int64_t idim_bd = boundaries.sizes().back(); + + const int64_t* data_st = + sorter.defined() ? 
sorter.data_ptr() : nullptr; + output_t* data_out = result.data_ptr(); + + bool is_1d_boundaries = boundaries.dim() == 1; + auto data_in_data = input.data_ptr(); + auto data_bd_data = boundaries.data_ptr(); + auto data_out_data = result.data_ptr(); + SearchsortedKernelFunctor kfn( + right, + numel_in, + idim_in, + idim_bd, + data_st, + data_out, + is_1d_boundaries, + data_in_data, + data_bd_data, + data_out_data); + + sycl_kernel_submit(grng, tile_size, getCurrentSYCLQueue(), kfn); +} + +void searchsorted_dispatch( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + bool out_int32, + bool right, + const Tensor& sorter) { + if (!out_int32) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } +} + +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter) { + // for non-contiguous result tensors, we write the output to a contiguous copy + // so we can later copy back, maintaining the original result tensor + Tensor out = result; + if (!result.is_contiguous()) { + out = result.contiguous(); + } + if (sorted_sequence.is_contiguous() && input.is_contiguous() && + sorted_sequence.dtype() == input.dtype() && sorter.is_contiguous()) { + searchsorted_dispatch( + out, input, sorted_sequence, out_int32, right, sorter); + } else { + Tensor trimmed_input; + Tensor trimmed_boundaries; + Tensor trimmed_sorter; + at::native::searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + input, + sorted_sequence, + sorter); + const Tensor& final_input = trimmed_input.defined() ? trimmed_input : input; + const Tensor& final_boundaries = + trimmed_boundaries.defined() ? trimmed_boundaries : sorted_sequence; + const Tensor& final_sorter = + trimmed_sorter.defined() ? 
trimmed_sorter : sorter; + searchsorted_dispatch( + out, final_input, final_boundaries, out_int32, right, final_sorter); + } + + // if result is non-contiguous, we wrote the answer to a copied version, so we + // copy back to the original result tensor + if (!result.is_contiguous()) { + result.copy_(out); + } +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h new file mode 100644 index 000000000..f47cea2af --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at::native::xpu { +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter); +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 686f5e2d3..56b25d0ef 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -14,10 +14,11 @@ struct ComplexFunctor { }; void complex_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_xpu", [&]() { - ComplexFunctor f; - gpu_kernel(iter, f); - }); + AT_DISPATCH_FLOATING_TYPES_AND( + kHalf, iter.input_dtype(0), "complex_xpu", [&]() { + ComplexFunctor f; + gpu_kernel(iter, f); + }); } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 6e117ce61..8d3128e9e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,7 +1,4 @@ #include -#include -#include - #include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index fcfdad46e..32f987a2e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -1,6 +1,5 @@ #pragma once #include -#include namespace at::native::xpu { diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index fbb2dc73c..6f296cbd0 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -138,6 +138,8 @@ "sigmoid", "sgn", "nn.functional.embedding_bag", + "bucketize", + "searchsorted", "grid_sampler_2d", # "nn.functional.grid_sample", # Lack of XPU implementation of aten::grid_sampler_3d. 
"acos", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index f4ce0960f..d8cc5825b 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -453,6 +453,13 @@ supported: - _cdist_forward - _pin_memory - is_pinned + - bucketize.Tensor + - bucketize.Tensor_out + - bucketize.Scalar + - searchsorted.Tensor + - searchsorted.Tensor_out + - searchsorted.Scalar + - searchsorted.Scalar_out - trace - reflection_pad2d - reflection_pad2d.out From 3fc911b8f9f29225b0772df583cf573b1d298b10 Mon Sep 17 00:00:00 2001 From: Zhong Ruijie <109201212+RUIJIEZHONG66166@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:22:39 +0800 Subject: [PATCH 09/20] Add UT test for nightly (#515) --- .github/workflows/_linux_ut.yml | 123 ++++++++++++++++++ .../workflows/inductor_xpu_e2e_nightly.yml | 24 ++++ .github/workflows/pull.yml | 86 +----------- 3 files changed, 152 insertions(+), 81 deletions(-) create mode 100644 .github/workflows/_linux_ut.yml diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml new file mode 100644 index 000000000..7cf2746c3 --- /dev/null +++ b/.github/workflows/_linux_ut.yml @@ -0,0 +1,123 @@ +name: inductor-xpu-ut-test + +on: + workflow_call: + inputs: + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Set runner + + +jobs: + Inductor-XPU-UT-Tests: + runs-on: ${{ inputs.runner }} + timeout-minutes: 900 + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + cd ../ && rm -rf pytorch + git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch + cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive + if [ -z ${{ inputs.torch_xpu_ops_update }} ]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + if [[ ${{ inputs.torch_xpu_ops_update }} == 'true' ]]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + echo "Not update torch-xpu-ops" + fi + fi + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + - name: Build Pytorch XPU + run: | + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + conda install -c intel mkl-static mkl-include -y + cd ../pytorch + pip install -r requirements.txt + export USE_XPU=1 + source /opt/intel/oneapi/compiler/latest/env/vars.sh + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + python setup.py bdist_wheel + pip install --force-reinstall dist/*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
+ pip install -r .ci/docker/requirements-ci.txt + - name: Run XPU OP Examples + if: contains(inputs.ut_suite, 'op_example') + run: | + cd ${{ github.workspace }} + mkdir -p ut_log + xpu-smi discovery + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ${{ github.workspace }} + cd examples + pip install pytest + timeout 8000 pytest -v + - name: Run XPU OP Extended UT + if: contains(inputs.ut_suite, 'op_extended') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ + timeout 10000 python run_test_with_skip.py + - name: Run XPU OP UT + if: contains(inputs.ut_suite, 'op_ut') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_ENABLE_XPU_FALLBACK=1 + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu + timeout 10000 python run_test_with_skip.py + # Cases run with a on-demand white list, since some suites are too + # slow to go through all operators on CPU. So add cases on-demand + # when XPU implementatoin is done. + # test_foreach, test_decomp + timeout 10000 python run_test_with_only.py + - name: Run Torch XPU UT + if: contains(inputs.ut_suite, 'torch_xpu') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TEST_REPORTS_DIR=$(pwd)/test/test-reports + rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" + # Run Pytorch XPU binary UT + for xpu_case in build/bin/*{xpu,sycl}*; do + if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then + case_name=$(basename "$xpu_case") + echo "Testing ${case_name} ..." + "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml + fi + done + # Run Pytorch XPU python UT + export PYTORCH_ENABLE_XPU_FALLBACK=1 + sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py + python test/run_test.py --xpu + diff --git a/.github/workflows/inductor_xpu_e2e_nightly.yml b/.github/workflows/inductor_xpu_e2e_nightly.yml index 8307edae7..a8d316580 100644 --- a/.github/workflows/inductor_xpu_e2e_nightly.yml +++ b/.github/workflows/inductor_xpu_e2e_nightly.yml @@ -41,6 +41,21 @@ on: type: string default: '' description: If set, will only launch this one + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. 
Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch permissions: read-all @@ -244,6 +259,15 @@ jobs: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files + Inductor-XPU-UT-Nightly-Tests: + if: ${{ inputs.ut_suite }} + name: Nightly Inductor XPU UT Test + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: ${{ inputs.ut_suite }} + pytorch_branch: ${{ inputs.pytorch_branch }} + runner: linux.idc.xpu + Tests-Failure-And-Report: if: always() runs-on: pvc_e2e diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bd65effa9..1bd635d1a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -23,84 +23,8 @@ jobs: # Don't run on forked repos and draft PRs if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} name: preci-ut - runs-on: linux.idc.xpu - timeout-minutes: 240 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v3 - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ && rm -rf pytorch - git clone -b main https://github.com/pytorch/pytorch - cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - - name: Build Pytorch XPU - run: | - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - conda install -c intel mkl-static mkl-include -y - cd ../pytorch - pip install -r requirements.txt - export USE_XPU=1 - source /opt/intel/oneapi/compiler/latest/env/vars.sh - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel - pip install --force-reinstall dist/*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - pip install -r .ci/docker/requirements-ci.txt - - name: Run XPU OP Examples - if: ${{ hashFiles('examples/') != '' }} - run: | - xpu-smi discovery - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd examples - pip install pytest - timeout 8000 pytest -v - - name: Run XPU OP Extended UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ - timeout 10000 python run_test_with_skip.py - - name: Run XPU OP UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu - timeout 10000 python run_test_with_skip.py - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. 
- # test_foreach, test_decomp - timeout 10000 python run_test_with_only.py - - name: Run Torch XPU UT - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TEST_REPORTS_DIR=$(pwd)/test/test-reports - rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" - # Run Pytorch XPU binary UT - for xpu_case in build/bin/*{xpu,sycl}*; do - if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then - case_name=$(basename "$xpu_case") - echo "Testing ${case_name} ..." - "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml - fi - done - # Run Pytorch XPU python UT - export PYTORCH_ENABLE_XPU_FALLBACK=1 - sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py - python test/run_test.py --xpu + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: op_example,op_extended,op_ut,torch_xpu + runner: linux.idc.xpu + \ No newline at end of file From 6781c4a0b56500610ed9d0647d9172dac4bb436c Mon Sep 17 00:00:00 2001 From: hjhee Date: Fri, 5 Jul 2024 14:54:44 +0800 Subject: [PATCH 10/20] Add aten::ceil (#463) - ceil.out - ceil - ceil_ --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/UnaryOps.cpp | 36 +++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - .../native/xpu/sycl/UnaryFractionKernels.cpp | 21 +++++++++++ .../native/xpu/sycl/UnaryFractionKernels.h | 2 ++ yaml/xpu_functions.yaml | 3 ++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 3c7d8a1d6..4d07a466b 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -515,4 +515,40 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { return out; } +TensorIterator ceil_meta(const Tensor& self, Tensor& out) { + TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); + TensorIterator iter; + iter.build_borrowing_unary_op(out, self); + return iter; +} + +Tensor XPUNativeFunctions::ceil(const Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self.clone(); + } + Tensor out; + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::ceil_(Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self; + } + auto iter = ceil_meta(self, self); + native::xpu::ceil_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + out.copy_(self); + return out; + } + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return out; +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index f8b3db5de..66f7dd905 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -180,7 +180,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "bitwise_right_shift.Tensor_out", "cauchy_", "_cdist_backward", - "ceil.out", "channel_shuffle", "cholesky", "cholesky_inverse", diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index b33be1a30..82bdc4c28 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -55,4 +55,25 @@ void 
reciprocal_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, ReciprocalFunctor()); }); } +template +struct CeilFunctor { + scalar_t operator()(const scalar_t a) const { + return std::ceil(a); + } +}; + +template +struct CeilFunctor> { + c10::complex operator()(const c10::complex a) const { + return c10::complex(std::ceil(a.real()), std::ceil(a.imag())); + } +}; + +void ceil_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "ceil_xpu", [&]() { + gpu_kernel(iter, CeilFunctor()); + }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 5de711c78..7ab9baf32 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -6,4 +6,6 @@ namespace at::native::xpu { void reciprocal_kernel(TensorIteratorBase& iter); +void ceil_kernel(TensorIteratorBase& iter); + } // namespace at::native::xpu diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index d8cc5825b..04c027ca6 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -500,3 +500,6 @@ supported: - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ + - ceil + - ceil_ + - ceil.out From 1951fce0fc58eae0c3e1d491c85958b73c61575f Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Fri, 5 Jul 2024 14:55:21 +0800 Subject: [PATCH 11/20] Add aten::eye and its variant (#480) Signed-off-by: Feng Yuan --- src/ATen/native/xpu/TensorFactories.cpp | 19 +++++++++++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 2 ++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index ee29aa167..110590958 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -18,6 +18,25 @@ namespace at { +Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) { + return XPUNativeFunctions::eye_out(n, n, result); +} + +Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + Tensor XPUNativeFunctions::empty( IntArrayRef size, c10::optional dtype_opt, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 66f7dd905..496eb00f1 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -203,7 +203,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "exp2.out", "expm1.out", "exponential_", - "eye.m_out", "_fft_c2c", "_fft_c2r", "_fft_r2c", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 6f296cbd0..6511f4120 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -19,6 +19,7 @@ _xpu_computation_op_list = [ "empty", + "eye", "fill", "zeros", "zeros_like", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 04c027ca6..2cd535394 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -190,6 +190,8 @@ supported: - exp_ - empty.memory_format - empty_strided + - 
eye.out + - eye.m_out - _efficientzerotensor - complex.out - clone From 0fe12f3d9fdbef8ca6553dc0cd7e6bfd309a4792 Mon Sep 17 00:00:00 2001 From: hjhee Date: Mon, 8 Jul 2024 13:06:45 +0800 Subject: [PATCH 12/20] Add aten::conj_physical (#477) - _conj_physical - conj_physical - conj_physical.out - conj_physical_ --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/UnaryOps.cpp | 13 ++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - .../native/xpu/sycl/UnaryComplexKernels.cpp | 26 +++++++++++++++++++ .../native/xpu/sycl/UnaryComplexKernels.h | 2 ++ test/xpu/run_test_with_skip.py | 10 +------ test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 2 ++ 7 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 4d07a466b..ffc528fab 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -515,6 +516,18 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { return out; } +Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { + auto iter = TensorIterator::unary_op(out, self); + native::xpu::conj_physical_kernel(iter); + return out; +} + +Tensor& XPUNativeFunctions::conj_physical_(Tensor& self) { + if (!self.is_complex()) + return self; + return XPUNativeFunctions::conj_physical_out(self, self); +} + TensorIterator ceil_meta(const Tensor& self, Tensor& out) { TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); TensorIterator iter; diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 496eb00f1..0ece10206 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -184,7 +184,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "cholesky", "cholesky_inverse", "_cholesky_solve_helper", - "conj_physical.out", "copysign.out", "cosh.out", "count_nonzero.dim_IntList", diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index e082096c1..87de57a3a 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -30,6 +30,32 @@ void conj_kernel(TensorIterator& iter) { })); } +template +struct ConjPhysicalFunctor { + scalar_t operator()(scalar_t z) const { + return std::conj(z); + } +}; + +template +struct ConjPhysicalFunctor> { + c10::complex operator()(c10::complex z) const { + return c10::complex(z.real(), -z.imag()); + } +}; + +void conj_physical_kernel(TensorIterator& iter) { + AT_DISPATCH_SWITCH( + iter.common_dtype(), + "conj_xpu", + AT_DISPATCH_CASE_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, [&] { + // Conj is a no-op for non-complex types + copy_kernel(iter); + }) AT_DISPATCH_CASE_COMPLEX_TYPES_AND(kComplexHalf, [&] { + gpu_kernel(iter, ConjPhysicalFunctor()); + })); +} + template struct NegConjScalarFunc { scalar_t operator()(scalar_t src_val) const { diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index 8d19381b3..d3ad4fe15 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void conj_kernel(TensorIterator& iter); +void conj_physical_kernel(TensorIterator& iter); + void neg_conj_kernel(TensorIterator& iter); void neg_kernel(TensorIterator& iter); diff --git 
a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 9287a85b6..02e2542c8 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -207,7 +207,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_torch_fallback__refs_square_xpu_bool", "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", - "test_variant_consistency_eager_conj_physical_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", @@ -242,8 +241,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex64", - "test_conj_view_conj_physical_xpu_complex64", - "test_neg_conj_view_conj_physical_xpu_complex128", # Skip list of new added when porting XPU operators. # See: https://github.com/intel/torch-xpu-ops/issues/128 @@ -2207,9 +2204,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64", "test_forward_mode_AD_nn_functional_rrelu_xpu_float64", - # RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_forward_mode_AD_conj_physical_xpu_complex128", - # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. +# NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. "test_fn_fwgrad_bwgrad_to_sparse_xpu_float64", "test_forward_mode_AD_to_sparse_xpu_float64", ) @@ -2745,9 +2740,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): ### Error #7 in TestBwdGradientsXPU , totally 2 , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [XPU, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher]. 
"test_fn_grad_to_sparse_xpu_float64", "test_fn_gradgrad_to_sparse_xpu_float64", - ### Error #8 in TestBwdGradientsXPU , totally 2 , RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_grad_conj_physical_xpu_complex128", - "test_inplace_gradgrad_conj_physical_xpu_complex128", ) res += launch_test("test_ops_gradients_xpu.py", skip_list) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 6511f4120..add5367fa 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -155,6 +155,7 @@ "bincount", "renorm", "lerp", + "conj_physical", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 2cd535394..10fd6748b 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -502,6 +502,8 @@ supported: - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ + - conj_physical.out + - conj_physical_ - ceil - ceil_ - ceil.out From 441a56cf88e75e80dd85b184da138d477cd5089d Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Mon, 8 Jul 2024 13:13:01 +0800 Subject: [PATCH 13/20] Add aten::is_set_to (#516) Signed-off-by: Feng Yuan --- src/ATen/native/xpu/TensorProperties.cpp | 16 ++++++++++++++++ yaml/xpu_functions.yaml | 1 + 2 files changed, 17 insertions(+) create mode 100644 src/ATen/native/xpu/TensorProperties.cpp diff --git a/src/ATen/native/xpu/TensorProperties.cpp b/src/ATen/native/xpu/TensorProperties.cpp new file mode 100644 index 000000000..428d18fcd --- /dev/null +++ b/src/ATen/native/xpu/TensorProperties.cpp @@ -0,0 +1,16 @@ +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#endif + +namespace at { + +bool XPUNativeFunctions::is_set_to(const Tensor& self, const Tensor& src) { + return at::native::is_set_to(self, src); +} + +} // namespace at diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 10fd6748b..e76bfcd8e 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -455,6 +455,7 @@ supported: - _cdist_forward - _pin_memory - is_pinned + - is_set_to - bucketize.Tensor - bucketize.Tensor_out - bucketize.Scalar From 682d0e4ff252cf9c69d04141fa4430092fbd5dd9 Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Tue, 9 Jul 2024 08:34:10 +0800 Subject: [PATCH 14/20] Add aten::addcdiv and its variants (#486) Signed-off-by: Feng Yuan --- src/ATen/native/xpu/PointwiseOps.cpp | 57 ++++++++++ src/ATen/native/xpu/XPUFallback.template | 1 - .../native/xpu/sycl/PointwiseOpsKernels.cpp | 103 +++++++++++++++--- .../native/xpu/sycl/PointwiseOpsKernels.h | 2 + test/xpu/xpu_test_utils.py | 1 + yaml/xpu_functions.yaml | 3 + 6 files changed, 148 insertions(+), 19 deletions(-) diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index 210cec3e6..a01bdc391 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -6,6 +6,63 @@ namespace at { +TensorIterator addcdiv_meta( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && + isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { + TORCH_CHECK( + false, + "Integer division with addcdiv is no longer supported, and in a future ", + "release addcdiv will perform a true division of tensor1 and tensor2. 
", + "The historic addcdiv behavior can be implemented as ", + "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", + "for integer inputs and as ", + "(input + value * tensor1 / tensor2) for float inputs. ", + "The future addcdiv behavior is just the latter implementation: ", + "(input + value * tensor1 / tensor2), for all dtypes."); + } + + TensorIterator iter; + iter.build_ternary_op(out, self, tensor1, tensor2); + return iter; +} + +Tensor& XPUNativeFunctions::addcdiv_out( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return out; +} + +Tensor XPUNativeFunctions::addcdiv( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + Tensor out; + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return iter.output(); +} + +Tensor& XPUNativeFunctions::addcdiv_( + Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); + native::xpu::addcdiv_kernel(iter, value); + return self; +} + TensorIterator addcmul_meta( const Tensor& self, const Tensor& tensor1, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 0ece10206..fa7fafe13 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -163,7 +163,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "adaptive_max_pool2d.out", "adaptive_max_pool3d_backward.grad_input", "adaptive_max_pool3d.out", - "addcdiv.out", "aminmax.out", "angle", "argmin.out", diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 7b00d09e3..d38f511d7 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -1,6 +1,6 @@ #include +#include #include -#include #include #include @@ -8,31 +8,98 @@ namespace at::native::xpu { template -struct AddcmulKernelFunctor { - using opmath_t = at::opmath_type; +struct AddcmulFunctor { + using accscalar_t = at::acc_type; scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { - return static_cast(a) + - alpha_ * static_cast(b) * static_cast(c); + return static_cast(a) + + alpha_ * static_cast(b) * static_cast(c); } - AddcmulKernelFunctor(opmath_t alpha) : alpha_(alpha) {} + AddcmulFunctor(accscalar_t alpha) : alpha_(alpha) {} private: - opmath_t alpha_; + accscalar_t alpha_; +}; + +template +struct AddcmulComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * b * c; + } + + AddcmulComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; }; void addcmul_kernel(TensorIterator& iter, Scalar value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, - at::ScalarType::BFloat16, - iter.dtype(), - "addcmul_xpu", - [&]() { - using opmath_t = at::opmath_type; - auto alpha = value.to(); - AddcmulKernelFunctor f(alpha); - gpu_kernel(iter, f); - }); + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_xpu", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, AddcmulComplexFunctor(alpha)); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + 
iter.dtype(), + "addcmul_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, AddcmulFunctor(alpha)); + }); + } +} + +template +struct AddcdivFunctor { + using accscalar_t = at::acc_type; + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / static_cast(c)); + } + + AddcdivFunctor(accscalar_t alpha) : alpha_(alpha) {} + + private: + accscalar_t alpha_; +}; + +template +struct AddcdivComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / c); + } + + AddcdivComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; +}; + +void addcdiv_kernel(TensorIterator& iter, Scalar value) { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_xpu", [&]() { + auto alpha = value.to(); + AddcdivComplexFunctor f(alpha); + gpu_kernel(iter, f); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "addcdiv_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + AddcdivFunctor f(alpha); + gpu_kernel(iter, f); + }); + } } template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index fdb216dbd..c775b88e5 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void addcmul_kernel(TensorIterator& iter, Scalar value); +void addcdiv_kernel(TensorIterator& iter, Scalar value); + void mse_backward_kernel(TensorIterator& iter, const Scalar& value); } // namespace at::native::xpu diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index add5367fa..35c29d96b 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -44,6 +44,7 @@ "bitwise_or", "bitwise_xor", "addcmul", + "addcdiv", "clamp", "clamp_max", "clamp_min", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index e76bfcd8e..2ecc6790b 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -497,6 +497,9 @@ supported: - avg_pool2d.out - avg_pool2d_backward - avg_pool2d_backward.grad_input + - addcdiv.out + - addcdiv + - addcdiv_ - addcmul.out - addcmul - addcmul_ From 9df4ab4174fcb908924c7ee34dc82d1280bc5ff4 Mon Sep 17 00:00:00 2001 From: "Cheng, Penghui" Date: Tue, 9 Jul 2024 08:56:53 +0800 Subject: [PATCH 15/20] Fixed col2im test error: The values for attribute 'shape' do no match (#543) Fixed col2im test case error: The values for attribute 'shape' do not match Signed-off-by: Cheng Penghui --- src/ATen/native/xpu/sycl/Col2ImKernel.cpp | 2 +- src/ATen/native/xpu/sycl/Im2ColKernel.cpp | 2 +- test/xpu/run_test_with_skip.py | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp index a0ba012c1..299711162 100644 --- a/src/ATen/native/xpu/sycl/Col2ImKernel.cpp +++ b/src/ATen/native/xpu/sycl/Col2ImKernel.cpp @@ -200,7 +200,7 @@ void col2im_kernel( bool batched_input = true; if (input.dim() == 2) { batched_input = false; - input.resize_({1, input.size(0), input.size(1)}); + input = input.view({1, input.size(0), input.size(1)}); } auto batch_size = input.size(0); diff --git a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp index 149665bc7..aa511e6df 100644 --- a/src/ATen/native/xpu/sycl/Im2ColKernel.cpp +++ 
b/src/ATen/native/xpu/sycl/Im2ColKernel.cpp @@ -187,7 +187,7 @@ void im2col_kernel( if (input.dim() == 3) { batched_input = false; - input.resize_({1, input.size(0), input.size(1), input.size(2)}); + input = input.view({1, input.size(0), input.size(1), input.size(2)}); } auto batch_size = input.size(0); diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 02e2542c8..ea50fbc29 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1288,9 +1288,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). "test_EmbeddingBag_sparse_cuda", "test_Embedding_sparse_cuda", - # col2im: AssertionError: The values for attribute 'shape' do not match: torch.Size([16, 4]) != torch.Size([1, 16, 4]). - "test_Fold_no_batch_dim_input_cuda", # col2im - "test_Fold_no_batch_dim_int_input_cuda", # AssertionError: 'XPU error: device-side assert triggered' not found in ' File "", line 8\n def test_cross_entropy_loss_2d_out_of_bounds_class_index(self):\n ^\nIndentationError: expected an indented block\n' "test_cross_entropy_loss_2d_out_of_bounds_class_index_xpu_float16", "test_cross_entropy_loss_2d_out_of_bounds_class_index_xpu_float32", From 9046f60978fee672fffc387ffbfb75499b7bb6cc Mon Sep 17 00:00:00 2001 From: "Wang, Chuanqi" Date: Tue, 9 Jul 2024 10:20:40 +0800 Subject: [PATCH 16/20] Refine ci/cd both for ut and e2e (#541) --- .../actions/inductor-xpu-e2e-test/action.yml | 6 +- .github/workflows/_linux_ut.yml | 93 +++++---- .github/workflows/inductor_xpu_e2e_ci.yml | 137 ------------- ...u_e2e_nightly.yml => nightly_ondemand.yml} | 191 +++++++----------- .github/workflows/pull.yml | 120 ++++++++++- 5 files changed, 254 insertions(+), 293 deletions(-) delete mode 100644 .github/workflows/inductor_xpu_e2e_ci.yml rename .github/workflows/{inductor_xpu_e2e_nightly.yml => nightly_ondemand.yml} (68%) diff --git a/.github/actions/inductor-xpu-e2e-test/action.yml b/.github/actions/inductor-xpu-e2e-test/action.yml index 6e1dd4268..f4840f92b 100644 --- a/.github/actions/inductor-xpu-e2e-test/action.yml +++ b/.github/actions/inductor-xpu-e2e-test/action.yml @@ -41,7 +41,7 @@ runs: shell: bash run: | source activate e2e_ci - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then cd ../ && rm -rf audio && git clone --single-branch -b main https://github.com/pytorch/audio.git cd audio && git checkout $TORCHAUDIO_COMMIT_ID @@ -80,7 +80,7 @@ runs: source activate e2e_ci cp .github/scripts/inductor_xpu_test.sh ../pytorch cd ../pytorch - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh rm -f ${{ github.workspace }}/summary_accuracy.log # check param function contains() { @@ -198,7 +198,7 @@ runs: source activate e2e_ci cp .github/scripts/inductor_perf_summary.py ../pytorch cd ../pytorch - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh pip install styleFrame scipy pandas set -xe for suite in $(echo ${{ inputs.suite }} |sed 's/,/ /g') diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7cf2746c3..4abc9a95b 100644 --- a/.github/workflows/_linux_ut.yml 
+++ b/.github/workflows/_linux_ut.yml @@ -1,32 +1,37 @@ -name: inductor-xpu-ut-test +name: Linux UT Test on: workflow_call: inputs: - torch_xpu_ops_update: + pytorch: required: false type: string - default: 'true' - description: True means update xpu_ops when building pytorch, otherwise means not - ut_suite: + default: 'main' + description: Pytorch branch/commit + keep_torch_xpu_ops: + required: false + type: string + default: 'false' + description: Keep torch-xpu-ops pin. `true` means use pined commit + ut: required: true type: string - default: 'op_example,op_extended,op_ut,torch_xpu' - description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma - pytorch_branch: + default: '' + description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma + python: required: false type: string - default: 'main' - description: Set pytorch branch + default: '3.10' + description: Python version runner: required: true type: string default: 'linux.idc.xpu' - description: Set runner + description: Runner label jobs: - Inductor-XPU-UT-Tests: + Torch-XPU-UT-Tests: runs-on: ${{ inputs.runner }} timeout-minutes: 900 steps: @@ -36,60 +41,60 @@ jobs: run: | pwd cd ../ && rm -rf pytorch - git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch - cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive - if [ -z ${{ inputs.torch_xpu_ops_update }} ]; then - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + git clone https://github.com/pytorch/pytorch pytorch + cd pytorch && git checkout ${{ inputs.pytorch }} + # apply PRs for stock pytorch + pip install requests + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py + git status && git show -s + git submodule sync && git submodule update --init --recursive + if [[ ${{ inputs.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" else - if [[ ${{ inputs.torch_xpu_ops_update }} == 'true' ]]; then - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - else - echo "Not update torch-xpu-ops" - fi + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt fi - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - name: Build Pytorch XPU run: | which conda && conda clean -ay conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=${{ inputs.python }} cmake ninja -y source activate xpu_op_${ZE_AFFINITY_MASK} conda install -c intel mkl-static mkl-include -y cd ../pytorch pip install -r requirements.txt export USE_XPU=1 - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} python setup.py bdist_wheel pip install --force-reinstall dist/*.whl git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. 
pip install -r .ci/docker/requirements-ci.txt - name: Run XPU OP Examples - if: contains(inputs.ut_suite, 'op_example') + if: contains(inputs.ut, 'op_example') || github.event_name == 'schedule' run: | cd ${{ github.workspace }} - mkdir -p ut_log xpu-smi discovery - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh source activate xpu_op_${ZE_AFFINITY_MASK} cd ${{ github.workspace }} cd examples pip install pytest timeout 8000 pytest -v - name: Run XPU OP Extended UT - if: contains(inputs.ut_suite, 'op_extended') + if: contains(inputs.ut, 'op_extended') || github.event_name == 'schedule' run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh source activate xpu_op_${ZE_AFFINITY_MASK} export PYTORCH_TEST_WITH_SLOW=1 cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ timeout 10000 python run_test_with_skip.py - name: Run XPU OP UT - if: contains(inputs.ut_suite, 'op_ut') + if: contains(inputs.ut, 'op_ut') || github.event_name == 'schedule' run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh source activate xpu_op_${ZE_AFFINITY_MASK} export PYTORCH_ENABLE_XPU_FALLBACK=1 export PYTORCH_TEST_WITH_SLOW=1 @@ -101,9 +106,9 @@ jobs: # test_foreach, test_decomp timeout 10000 python run_test_with_only.py - name: Run Torch XPU UT - if: contains(inputs.ut_suite, 'torch_xpu') + if: contains(inputs.ut, 'torch_xpu') || github.event_name == 'schedule' run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh source activate xpu_op_${ZE_AFFINITY_MASK} cd ../pytorch TEST_REPORTS_DIR=$(pwd)/test/test-reports @@ -117,7 +122,21 @@ jobs: fi done # Run Pytorch XPU python UT - export PYTORCH_ENABLE_XPU_FALLBACK=1 - sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py - python test/run_test.py --xpu + export PYTORCH_TEST_WITH_SLOW=1 + export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + test_cmd="python test/run_test.py --include " + # All Inductor UT under test/inductor + for test in $(ls test/inductor | grep test); + do + test_cmd="${test_cmd} inductor/$test"; + done + # All xpu ut under test/xpu + for test in $(ls test/xpu | grep test); + do + test_cmd="${test_cmd} xpu/$test"; + done + if [ -f "test/test_xpu.py" ]; then + test_cmd="${test_cmd} test_xpu.py" + fi + eval $test_cmd diff --git a/.github/workflows/inductor_xpu_e2e_ci.yml b/.github/workflows/inductor_xpu_e2e_ci.yml deleted file mode 100644 index c7d408b33..000000000 --- a/.github/workflows/inductor_xpu_e2e_ci.yml +++ /dev/null @@ -1,137 +0,0 @@ -name: E2E CI Tests - -on: - workflow_dispatch: - pull_request: - types: - - opened - - synchronize - - reopened - - converted_to_draft - - ready_for_review - branches: - - main - - release/* - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - Inductor-XPU-E2E-CI-Tests: - runs-on: pvc_e2e - # Don't run on forked repos and draft PRs - if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} - timeout-minutes: 900 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create 
-n e2e_ci python=3.10 cmake ninja -y - source activate e2e_ci - conda install -c intel mkl-static mkl-include -y - pip install pandas scipy tqdm - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ && rm -rf pytorch - source activate e2e_ci - git clone -b main https://github.com/pytorch/pytorch pytorch - cd pytorch - # apply PRs for stock pytorch - pip install requests - # https://github.com/mengfei25/pytorch/pull/18 internal use only for subset model list - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/18 - git status && git show -s - git submodule sync && git submodule update --init --recursive - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" - TRITON_PINNED_COMMIT=$(cat .ci/docker/ci_commit_pins/triton-xpu.txt) - echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT} - pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python" - - name: Build Pytorch XPU - run: | - source activate e2e_ci - cd ../pytorch - pip install -r requirements.txt - export USE_XPU=1 - source /opt/intel/oneapi/compiler/latest/env/vars.sh - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel - pip install --force-reinstall dist/*.whl - - name: Identify pinned versions - run: | - cd ../pytorch - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - - name: Show GITHUB_ENV - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* - - name: Huggingface BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Huggingface FP16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Timm_models BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Torchbench BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Summarize archieve files - if: always() - run: | - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log 
|wc -l || true) - if [ ${failed_case} -ne 0 ];then - grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log - exit 1 - fi - - name: Upload Inductor XPU E2E Data - if: always() - uses: actions/upload-artifact@v4 - with: - name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files diff --git a/.github/workflows/inductor_xpu_e2e_nightly.yml b/.github/workflows/nightly_ondemand.yml similarity index 68% rename from .github/workflows/inductor_xpu_e2e_nightly.yml rename to .github/workflows/nightly_ondemand.yml index a8d316580..039407bc8 100644 --- a/.github/workflows/inductor_xpu_e2e_nightly.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -1,4 +1,4 @@ -name: E2E Nightly_OnDemand Tests +name: Nightly-OnDemand Tests on: schedule: @@ -6,57 +6,56 @@ on: - cron: '0 13 * * *' workflow_dispatch: inputs: - python: + pytorch: required: false type: string - default: '3.10' - description: Specify python version + default: 'main' + description: Pytorch branch/commit + keep_torch_xpu_ops: + required: false + type: string + default: 'false' + description: Keep torch-xpu-ops pin. `true` means use pined commit + ut: + required: true + type: string + default: 'torch_xpu' + description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma triton: required: false type: string default: '' - description: Specify triton commit, use pytorch pined commit by default + description: Triton commit. Use pytorch pined commit by default suite: required: true type: string default: 'huggingface' - description: Dynamo benchmarks test suite. huggingface,timm_models,torchbench. Delimiter is comma + description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench`. Delimiter is comma dt: required: true type: string default: 'float32' - description: Data precision of the test.float32,bfloat16,float16,amp_bf16,amp_fp16. Delimiter is comma + description: Data precision of the test. `float32,bfloat16,float16,amp_bf16,amp_fp16`. Delimiter is comma mode: required: true type: string default: 'inference' - description: inference,training. Delimiter is comma + description: Test mode. `inference,training`. Delimiter is comma scenario: required: true type: string default: 'accuracy' - description: accuracy,performance. Delimiter is comma + description: Test scenario. `accuracy,performance`. Delimiter is comma model: required: false type: string default: '' - description: If set, will only launch this one - torch_xpu_ops_update: - required: false - type: string - default: 'true' - description: True means update xpu_ops when building pytorch, otherwise means not - ut_suite: - required: true - type: string - default: 'op_example,op_extended,op_ut,torch_xpu' - description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma - pytorch_branch: + description: Model. 
Will only run this one mode if set + python: required: false type: string - default: 'main' - description: Set pytorch branch - + default: '3.10' + description: Python version permissions: read-all @@ -65,11 +64,26 @@ concurrency: cancel-in-progress: true jobs: - Inductor-XPU-E2E-Nightly-Tests: + Linux-Nightly-Ondemand-UT-Tests: + if: github.event_name == 'schedule' || ${{ inputs.ut_suite }} + uses: ./.github/workflows/_linux_ut.yml + with: + keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} + ut: ${{ github.event_name == 'schedule' && 'op_example,op_extended,op_ut,torch_xpu' || inputs.ut }} + pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} + python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + runner: linux.idc.xpu + + Linux-Nightly-Ondemand-E2E-Tests: runs-on: pvc_e2e # Don't run on forked repos if: github.repository_owner == 'intel' timeout-minutes: 900 + env: + pytorch: ${{ github.event_name == 'schedule' && 'main' || inputs.pytorch }} + keep_torch_xpu_ops: ${{ github.event_name == 'schedule' && 'false' || inputs.keep_torch_xpu_ops }} + ut: ${{ github.event_name == 'schedule' && 'op_example,op_extended,op_ut,torch_xpu' || inputs.ut }} + python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} outputs: TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} @@ -80,7 +94,6 @@ jobs: TORCHBENCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCHBENCH_COMMIT_ID }} TORCHVISION_COMMIT_ID: ${{ steps.pinned.outputs.TORCHVISION_COMMIT_ID }} TORCHAUDIO_COMMIT_ID: ${{ steps.pinned.outputs.TORCHAUDIO_COMMIT_ID }} - # TORCHTEXT_COMMIT_ID: ${{ steps.pinned.outputs.TORCHTEXT_COMMIT_ID }} TRANSFORMERS_VERSION: ${{ steps.pinned.outputs.TRANSFORMERS_VERSION }} TIMM_COMMIT_ID: ${{ steps.pinned.outputs.TIMM_COMMIT_ID }} TRITON_COMMIT_ID: ${{ steps.pinned.outputs.TRITON_COMMIT_ID }} @@ -91,7 +104,7 @@ jobs: run: | which conda && conda clean -ay conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=${{ inputs.python }} cmake ninja -y + conda create -n e2e_ci python=${{ env.python }} cmake ninja -y source activate e2e_ci conda install -c intel mkl-static mkl-include -y pip install pandas scipy tqdm @@ -100,16 +113,20 @@ jobs: pwd cd ../ && rm -rf pytorch source activate e2e_ci - git clone -b main https://github.com/pytorch/pytorch pytorch - cd pytorch + git clone https://github.com/pytorch/pytorch pytorch + cd pytorch && git checkout ${{ env.pytorch }} # apply PRs for stock pytorch pip install requests python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py git status && git show -s git submodule sync && git submodule update --init --recursive - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + if [[ ${{ env.keep_torch_xpu_ops }} == 'true' ]]; then + echo "Don't replace torch-xpu-ops!" 
+ else + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + fi - name: Identify pinned versions id: pinned run: | @@ -128,7 +145,7 @@ jobs: echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . /etc/os-release @@ -148,7 +165,7 @@ jobs: cd ../pytorch pip install -r requirements.txt export USE_XPU=1 - source /opt/intel/oneapi/compiler/latest/env/vars.sh + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} python setup.py bdist_wheel pip install --force-reinstall dist/*.whl @@ -157,63 +174,18 @@ jobs: echo "$GITHUB_ENV" rm -rf ../pytorch/inductor_log rm -rf /tmp/torchinductor_* - - name: Nightly Huggingface FP32 Inference Accuracy Test - if: ${{ !inputs.suite }} + - name: Nightly Huggingface FP32/BF16/FP16 Inference & Training Accuracy Test + if: github.event_name == 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: huggingface env_prepare: true - dt: float32 - mode: inference - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Huggingface BF16 Inference Accuracy Test - if: ${{ !inputs.suite }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: inference - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Huggingface FP16 Inference Accuracy Test - if: ${{ !inputs.suite }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: inference - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Huggingface FP32 Training Accuracy Test - if: ${{ !inputs.suite }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float32 - mode: training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Huggingface BF16 Training Accuracy Test - if: ${{ !inputs.suite }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Nightly Huggingface FP16 Training Accuracy Test - if: ${{ !inputs.suite }} - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training + dt: float32,bfloat16,float16 + mode: inference,traning scenario: accuracy hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Torchbench BF16 Training Accuracy Test - if: ${{ !inputs.suite }} + if: github.event_name == 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: torchbench @@ -223,7 +195,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: Nightly Timm_models 
FP16 Training Accuracy Test - if: ${{ !inputs.suite }} + if: github.event_name == 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: timm_models @@ -233,7 +205,7 @@ jobs: env_prepare: true hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }}) - if: ${{ inputs.suite }} + if: github.event_name != 'schedule' uses: ./.github/actions/inductor-xpu-e2e-test with: suite: ${{ inputs.suite }} @@ -258,15 +230,6 @@ jobs: with: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files - - Inductor-XPU-UT-Nightly-Tests: - if: ${{ inputs.ut_suite }} - name: Nightly Inductor XPU UT Test - uses: ./.github/workflows/_linux_ut.yml - with: - ut_suite: ${{ inputs.ut_suite }} - pytorch_branch: ${{ inputs.pytorch_branch }} - runner: linux.idc.xpu Tests-Failure-And-Report: if: always() @@ -275,7 +238,8 @@ jobs: issues: write env: GH_TOKEN: ${{ github.token }} - needs: Inductor-XPU-E2E-Nightly-Tests + python: ${{ github.event_name == 'schedule' && '3.10' || inputs.python }} + needs: Linux-Nightly-Ondemand-E2E-Tests steps: - name: Report github issue for XPU OPS nightly if: github.repository_owner == 'intel' @@ -284,23 +248,23 @@ jobs: # Test env build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" repo="${{ github.repository }}" - TORCH_BRANCH_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCH_BRANCH_ID }}" - TORCH_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCH_COMMIT_ID }}" - DRIVER_VERSION="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.DRIVER_VERSION }}" - BUNDLE_VERSION="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.BUNDLE_VERSION }}" - OS_PRETTY_NAME="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.OS_PRETTY_NAME }}" - GCC_VERSION="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.GCC_VERSION }}" - TORCHBENCH_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCHBENCH_COMMIT_ID }}" - TORCHVISION_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCHVISION_COMMIT_ID }}" - TORCHAUDIO_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - # TORCHTEXT_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCHTEXT_COMMIT_ID }}" - TRANSFORMERS_VERSION="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TRANSFORMERS_VERSION }}" - TIMM_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TIMM_COMMIT_ID }}" - TRITON_COMMIT_ID="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TRITON_COMMIT_ID }}" + TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}" + TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}" + DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}" + BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}" + OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}" + GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}" + TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" + TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" + TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" + # TORCHTEXT_COMMIT_ID="${{ 
needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHTEXT_COMMIT_ID }}" + TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" + TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" + TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" # Test status - if [ "${{ needs.Inductor-XPU-E2E-Nightly-Tests.result }}" == "success" ];then + if [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "success" ];then test_status=Success - elif [ "${{ needs.Inductor-XPU-E2E-Nightly-Tests.result }}" == "failure" ];then + elif [ "${{ needs.Linux-Nightly-Ondemand-E2E-Tests.result }}" == "failure" ];then test_status=Failure cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}" else @@ -317,7 +281,7 @@ jobs: test_issue_id=432 fi # Test report - echo -e "$cc_comment\n**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt + echo -e "**${test_status}** $test_type Test on $(date +'%F'), See: $build_url\n" > ${{ github.workspace }}/report.txt printf "Torch-xpu-ops | PyTorch | Triton\n--- | --- | ---\n${GITHUB_WORKFLOW_SHA:0:7} on ${GITHUB_REF_NAME} | " >> ${{ github.workspace }}/report.txt printf "[${TORCH_COMMIT_ID:0:7}](https://github.com/pytorch/pytorch/commit/${TORCH_COMMIT_ID:0:7}) on $TORCH_BRANCH_ID | " >> ${{ github.workspace }}/report.txt echo -e "[${TRITON_COMMIT_ID:0:7}](https://github.com/intel/intel-xpu-backend-for-triton/commit/${TRITON_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt @@ -328,7 +292,7 @@ jobs: printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt printf "Device | OS | GCC | Python | Driver(DKMS) | Bundle(DPCPP)\n--- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ inputs.python }} | $DRIVER_VERSION| $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt + echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION| $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" if [ "${{ inputs.triton }}" != "" ];then @@ -339,6 +303,7 @@ jobs: fi echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt fi + echo "$cc_comment\n" >> ${{ github.workspace }}/report.txt # Report report_txt=$(cat ${{ github.workspace }}/report.txt) gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 1bd635d1a..3b103e4d9 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -22,9 +22,123 @@ jobs: preci-ut: # Don't run on forked repos and draft PRs if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} - name: preci-ut uses: ./.github/workflows/_linux_ut.yml with: - ut_suite: op_example,op_extended,op_ut,torch_xpu + ut: op_example,op_extended,op_ut runner: linux.idc.xpu - \ No newline at end of file + + Inductor-XPU-E2E-CI-Tests: + runs-on: pvc_e2e + # Don't run on forked repos and draft PRs + if: ${{ (github.repository_owner == 'intel') && 
(github.event.pull_request.draft == false) }} + timeout-minutes: 900 + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Conda ENV + run: | + which conda && conda clean -ay + conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci + conda create -n e2e_ci python=3.10 cmake ninja -y + source activate e2e_ci + conda install -c intel mkl-static mkl-include -y + pip install pandas scipy tqdm + - name: Prepare Stock Pytorch + run: | + pwd + cd ../ && rm -rf pytorch + source activate e2e_ci + git clone -b main https://github.com/pytorch/pytorch pytorch + cd pytorch + # apply PRs for stock pytorch + pip install requests + # https://github.com/mengfei25/pytorch/pull/18 internal use only for subset model list + python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/mengfei25/pytorch/pull/18 + git status && git show -s + git submodule sync && git submodule update --init --recursive + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + - name: Triton Installation + run: | + source activate e2e_ci + cd ../pytorch + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + TRITON_PINNED_COMMIT=$(cat .ci/docker/ci_commit_pins/triton-xpu.txt) + echo ${TRITON_REPO}@${TRITON_PINNED_COMMIT} + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python" + - name: Build Pytorch XPU + run: | + source activate e2e_ci + cd ../pytorch + pip install -r requirements.txt + export USE_XPU=1 + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + python setup.py bdist_wheel + pip install --force-reinstall dist/*.whl + - name: Identify pinned versions + run: | + cd ../pytorch + echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" + echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" + echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" >> "${GITHUB_ENV}" + echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" + echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" + echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" + - name: Show GITHUB_ENV + run: | + echo "$GITHUB_ENV" + rm -rf ../pytorch/inductor_log + rm -rf /tmp/torchinductor_* + - name: Huggingface BF16 Training Accuracy Test + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: huggingface + dt: bfloat16 + mode: training + scenario: accuracy + env_prepare: true + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Huggingface FP16 Training Accuracy Test + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: huggingface + dt: float16 + mode: training + scenario: accuracy + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Timm_models BF16 Training Accuracy Test + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: timm_models + dt: bfloat16 + mode: training + scenario: accuracy + env_prepare: true + hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Torchbench BF16 Training Accuracy Test + uses: ./.github/actions/inductor-xpu-e2e-test + with: + suite: torchbench + dt: bfloat16 + mode: training + scenario: accuracy + env_prepare: true + hf_token: ${{ 
secrets.HUGGING_FACE_HUB_TOKEN }} + - name: Summarize archieve files + if: always() + run: | + rm -rf ${{ github.workspace }}/upload_files + cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files + failed_case=$(grep "Real failed: models: *[1-9]" ${{ github.workspace }}/upload_files/summary_accuracy.log |wc -l || true) + if [ ${failed_case} -ne 0 ];then + grep -E "Real failed: models: [1-9]|Summary for" ${{ github.workspace }}/summary_accuracy.log + exit 1 + fi + - name: Upload Inductor XPU E2E Data + if: always() + uses: actions/upload-artifact@v4 + with: + name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} + path: ${{ github.workspace }}/upload_files From 4db0b0cd1ca51d9cfd890be2eb3527b165782220 Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Tue, 9 Jul 2024 22:05:35 +0800 Subject: [PATCH 17/20] [PyTorch uplift breaks XPU] Follow utils usage of latest PyTorch definition (#545) Signed-off-by: Feng Yuan --- src/ATen/native/xpu/sycl/IndexingUtils.h | 2 +- test/xpu/extended/run_test_with_skip.py | 12 ++++++++++++ test/xpu/run_test_with_skip.py | 16 +++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/ATen/native/xpu/sycl/IndexingUtils.h b/src/ATen/native/xpu/sycl/IndexingUtils.h index 1c6d9c373..26eb2f1ea 100644 --- a/src/ATen/native/xpu/sycl/IndexingUtils.h +++ b/src/ATen/native/xpu/sycl/IndexingUtils.h @@ -99,7 +99,7 @@ static std::tuple computeLinearIndex( static std:: tuple> makeLinearIndex(Tensor self, IOptTensorListRef orig, bool check_range) { - checkIndexTensorTypes(orig, /*allow_int*/ true); + checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more // LongTensors auto indices = expandTensors(self, orig); diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index c7c2ff404..943d46465 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -128,6 +128,18 @@ # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed) # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed) "test_compare_cpu_hypot_xpu_bfloat16", + + # Regressions due to PyTorch uplift (Numeric difference in float and bfloat) + # https://github.com/intel/torch-xpu-ops/issues/549 + # Example fail log + # FAILED test_ops_xpu.py::TestCommonXPU::test_compare_cpu_nn_functional_batch_norm_xpu_float16 - AssertionError: Tensor-likes are not close! 
+ # Mismatched elements: 3 / 72 (4.2%) + # Greatest absolute difference: 0.0029296875 at index (0, 1, 1, 0) (up to 0.001 allowed) + # Greatest relative difference: 0.0032501220703125 at index (2, 1, 2, 1) (up to 0.001 allowed) + "test_compare_cpu_nn_functional_batch_norm_xpu_float16", + "test_compare_cpu_std_mean_xpu_bfloat16", + "test_compare_cpu_sub_xpu_float16", + "test_compare_cpu_var_mean_xpu_bfloat16", ) diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index ea50fbc29..9e8b02d3c 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -2995,23 +2995,21 @@ def launch_test(test_case, skip_list=None, exe_list=None): res += launch_test("nn/test_convolution_xpu.py", skip_list) # test_dynamic_shapes - - -res += launch_test("test_dynamic_shapes_xpu.py") +skip_list = ( + # Regression after PyTorch uplift + # https://github.com/intel/torch-xpu-ops/issues/549 + # AssertionError: 3 != 3.0 + "test_symnode_hashing", +) +res += launch_test("test_dynamic_shapes_xpu.py", skip_list) # test_load_state_dict - - res += launch_test("nn/test_load_state_dict_xpu.py") # test_module_hooks - - res += launch_test("nn/test_module_hooks_xpu.py") # test_parametrization - - res += launch_test("nn/test_parametrization_xpu.py") exit_code = os.WEXITSTATUS(res) From 1dcaf3eaa27aaf893b7630483ce97c81e1364fcb Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Wed, 10 Jul 2024 13:15:17 +0800 Subject: [PATCH 18/20] build: fix -Wunused-function warning (#554) PyTorch enforced unused-function and unused-variable compilation options recently. It leads to torch-xpu-ops building failure with -Werror. Fixing: https://github.com/pytorch/pytorch/pull/130084 Signed-off-by: Feng Yuan --- src/ATen/native/xpu/XPUFallback.template | 4 ---- test/xpu/run_test_with_skip.py | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index fa7fafe13..7bfdd6abd 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -109,10 +109,6 @@ static void xpu_lazy_registration_or_error_fallback( } } -static void xpu_force_fallback( - const c10::OperatorHandle& op, - torch::jit::Stack* stack) {} - TORCH_LIBRARY_IMPL(_, XPU, m) { static const char* enable_xpu_fallback = getenv("PYTORCH_ENABLE_XPU_FALLBACK"); diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index 9e8b02d3c..28a073bd4 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1380,6 +1380,11 @@ def launch_test(test_case, skip_list=None, exe_list=None): # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", "test_index_put_src_datatype_xpu_float8_e4m3fn", + + # Regression after PyTorch update + # http://github.com/intel/torch-xpu-ops/issues/549 + # IndexError: tensors used as indices must be long, byte or bool tensors. 
+ "test_index_ind_dtype_xpu", ) res += launch_test("test_indexing_xpu.py", skip_list) From d294ebd1ebf0ee9ad4d792a7d61962f2ff8a247f Mon Sep 17 00:00:00 2001 From: "Wang, Chuanqi" Date: Thu, 11 Jul 2024 09:21:13 +0800 Subject: [PATCH 19/20] [CI] enable WERROR build for ut test (#560) --- .github/ci_commit_pins/torchbench.txt | 1 + .github/workflows/_linux_ut.yml | 2 +- .github/workflows/nightly_ondemand.yml | 8 +++----- .github/workflows/pull.yml | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 .github/ci_commit_pins/torchbench.txt diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt new file mode 100644 index 000000000..ac2a7c011 --- /dev/null +++ b/.github/ci_commit_pins/torchbench.txt @@ -0,0 +1 @@ +bb5294090a397b15fadf10cd2172f9bd9c461f9a diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 4abc9a95b..0440d202b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -68,7 +68,7 @@ jobs: export USE_XPU=1 source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel + WERROR=1 python setup.py bdist_wheel pip install --force-reinstall dist/*.whl git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. pip install -r .ci/docker/requirements-ci.txt diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 039407bc8..926b099f3 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -17,10 +17,10 @@ on: default: 'false' description: Keep torch-xpu-ops pin. `true` means use pined commit ut: - required: true + required: false type: string default: 'torch_xpu' - description: UT scope. `op_example,op_extended,op_ut,torch_xpu` Delimiter is comma + description: UT scope. `op_example,op_extended,op_ut,torch_xpu`. 
Delimiter is comma triton: required: false type: string @@ -141,7 +141,6 @@ jobs: echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - # echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -257,7 +256,6 @@ jobs: TORCHBENCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHBENCH_COMMIT_ID }}" TORCHVISION_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHVISION_COMMIT_ID }}" TORCHAUDIO_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHAUDIO_COMMIT_ID }}" - # TORCHTEXT_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCHTEXT_COMMIT_ID }}" TRANSFORMERS_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRANSFORMERS_VERSION }}" TIMM_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TIMM_COMMIT_ID }}" TRITON_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TRITON_COMMIT_ID }}" @@ -303,7 +301,7 @@ jobs: fi echo -e "Inputs | $test_scope\n--- | --- \n" >> ${{ github.workspace }}/report.txt fi - echo "$cc_comment\n" >> ${{ github.workspace }}/report.txt + echo "$cc_comment" >> ${{ github.workspace }}/report.txt # Report report_txt=$(cat ${{ github.workspace }}/report.txt) gh --repo $repo issue comment $test_issue_id --body "$report_txt" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3b103e4d9..5ee55fdcf 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -82,7 +82,7 @@ jobs: cd ../pytorch echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" >> "${GITHUB_ENV}" + echo "TORCHBENCH_COMMIT_ID=$(> "${GITHUB_ENV}" echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" From 0253fb9021ada9c8df124b0c12d8f4aded4fb601 Mon Sep 17 00:00:00 2001 From: Feng Yuan Date: Thu, 11 Jul 2024 10:44:20 +0800 Subject: [PATCH 20/20] =?UTF-8?q?Using=20kernel=20specific=20max=20work=20?= =?UTF-8?q?group=20size=20instead=20of=20device=20max=20work=20=E2=80=A6?= =?UTF-8?q?=20(#542)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …group size. Max work group size of kernel is not a static and device related only property. It now in SYCL depends on driver/compiler implementation. Device max work group means the probable max work group size allowd by the device. But actual max work group size depends on driver/compiler implementation, like compilaton optimization. Using kernel specific max work group size could get actual max work group allowed correctly. 
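As a rough, self-contained sketch of the two queries being contrasted here (not part of this patch: the kernel name ProbeKernel and the printed output are purely illustrative, and the patch itself routes the kernel-specific query through the syclMaxWorkGroupSize(kfn) helper rather than calling SYCL directly), plain SYCL 2020 code would look like:

    #include <sycl/sycl.hpp>
    #include <algorithm>
    #include <iostream>

    class ProbeKernel; // illustrative kernel name, not taken from torch-xpu-ops

    int main() {
      sycl::queue q;
      sycl::device dev = q.get_device();

      // Device-wide upper bound: the largest work-group any kernel might use.
      size_t dev_max = dev.get_info<sycl::info::device::max_work_group_size>();

      // Kernel-specific limit: what the compiled binary of this kernel actually
      // allows, which can be smaller depending on the SIMD width and register
      // allocation the compiler picked.
      auto kid = sycl::get_kernel_id<ProbeKernel>();
      auto exe = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
          q.get_context(), {kid});
      sycl::kernel krn = exe.get_kernel(kid);
      size_t krn_max =
          krn.get_info<sycl::info::kernel_device_specific::work_group_size>(dev);

      // Launch with the kernel-specific limit, never the device-wide one.
      size_t wg = std::min(dev_max, krn_max);
      q.parallel_for<ProbeKernel>(
           sycl::nd_range<1>{sycl::range<1>{wg}, sycl::range<1>{wg}},
           [=](sycl::nd_item<1>) {})
          .wait();

      std::cout << "device max: " << dev_max << ", kernel max: " << krn_max << "\n";
      return 0;
    }
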
For example, on Xe, if compiler chooses SIMD16 and large GRF (32 HW threads per SS), the actual max work group size will be 512 (16 * 32), not 1024 queried by device::info::max_work_group_size. --------- Signed-off-by: Feng Yuan --- .../native/xpu/sycl/ActivationGluKernels.cpp | 8 +- .../sycl/AdaptiveAveragePooling2dKernels.cpp | 2 +- src/ATen/native/xpu/sycl/BatchKernel.h | 15 +- src/ATen/native/xpu/sycl/BatchNormKernels.cpp | 274 ++++++++++++------ .../native/xpu/sycl/BucketizationKernels.cpp | 36 +-- src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp | 60 ++-- .../native/xpu/sycl/EmbeddingBackwardKernel.h | 80 ++--- src/ATen/native/xpu/sycl/EmbeddingBag.cpp | 23 +- .../native/xpu/sycl/ForeachReduceKernels.cpp | 67 ++++- src/ATen/native/xpu/sycl/GridSampler.cpp | 18 +- src/ATen/native/xpu/sycl/GroupNormKernels.cpp | 36 ++- src/ATen/native/xpu/sycl/GroupReduceUtils.h | 6 +- src/ATen/native/xpu/sycl/Indexing.cpp | 53 +++- src/ATen/native/xpu/sycl/Indexing.h | 33 ++- src/ATen/native/xpu/sycl/LossNLLKernel.cpp | 19 +- src/ATen/native/xpu/sycl/MultiTensorApply.h | 37 ++- src/ATen/native/xpu/sycl/NonzeroKernel.cpp | 6 +- src/ATen/native/xpu/sycl/Norm.h | 28 +- src/ATen/native/xpu/sycl/RandpermKernel.cpp | 3 +- src/ATen/native/xpu/sycl/Reduce.h | 49 ++-- src/ATen/native/xpu/sycl/ScanUtils.h | 55 ++-- src/ATen/native/xpu/sycl/Shape.cpp | 21 +- src/ATen/native/xpu/sycl/SoftMaxKernels.cpp | 246 +++++++++++----- .../xpu/sycl/TensorTransformationsKernels.cpp | 16 +- .../native/xpu/sycl/TriangularOpsKernels.cpp | 12 +- .../xpu/sycl/UpSampleBicubic2dKernels.cpp | 9 +- .../xpu/sycl/UpSampleBilinear2dKernels.cpp | 17 +- src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h | 30 +- src/comm/DeviceProperties.h | 25 +- 29 files changed, 824 insertions(+), 460 deletions(-) diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp index b3606ddef..056e8c332 100644 --- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp +++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp @@ -97,13 +97,13 @@ void launch_glu_backward_kernel( OffsetCalc offset_calculator, int64_t gI_byte_offset, int64_t I_byte_offset) { - const int64_t local_size = syclMaxWorkGroupSize(); - const int64_t num_wg = (numel + local_size - 1) / local_size; - const int64_t global_size = num_wg * local_size; - GluBackwardKernelFunctor kfn( numel, gI, I, gO, offset_calculator, gI_byte_offset, I_byte_offset); + const int64_t local_size = syclMaxWorkGroupSize(kfn); + const int64_t num_wg = (numel + local_size - 1) / local_size; + const int64_t global_size = num_wg * local_size; + sycl_kernel_submit(global_size, local_size, getCurrentSYCLQueue(), kfn); } diff --git a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp index aacc66062..4d7ef286d 100644 --- a/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.cpp @@ -194,7 +194,7 @@ struct AdaptiveAvgPool2dBwdSLMKernelFunctor numel_ = ib_ * ic_ * ih_ * iw_; int total_item = std::min(numel_, syclMaxWorkItemsPerTile()); - local_range_ = syclMaxWorkGroupSize(); + local_range_ = syclMaxWorkGroupSize(*this); global_range_ = total_item < local_range_ ? 
local_range_ : (total_item / local_range_) * local_range_; diff --git a/src/ATen/native/xpu/sycl/BatchKernel.h b/src/ATen/native/xpu/sycl/BatchKernel.h index fef42b938..cff967a76 100644 --- a/src/ATen/native/xpu/sycl/BatchKernel.h +++ b/src/ATen/native/xpu/sycl/BatchKernel.h @@ -39,6 +39,7 @@ class BatchKernelConfig { problem_batch_(problem_batch), problem_along_x_(problem_along_x), policy_(policy_combine(policies)), + prefer_wg_size_(prefer_wg_size), problem_wg_range_(0), problem_glb_range_(0), problem_range_(0), @@ -47,12 +48,15 @@ class BatchKernelConfig { glb_range_x_(0), glb_range_y_(0), wg_range_x_(0), - wg_range_y_(0) { - size_t wg_size = syclMaxWorkGroupSize(); + wg_range_y_(0) {} + + template + void build() { + size_t wg_size = syclMaxWorkGroupSize(); size_t sg_size = syclMaxSubGroupSize(); - if (prefer_wg_size != 0 && prefer_wg_size % sg_size == 0 && - prefer_wg_size < wg_size) { - wg_size = prefer_wg_size; + if (prefer_wg_size_ != 0 && prefer_wg_size_ % sg_size == 0 && + prefer_wg_size_ < wg_size) { + wg_size = prefer_wg_size_; } wg_range_x_ = sg_size; wg_range_y_ = wg_size / wg_range_x_; @@ -263,6 +267,7 @@ class BatchKernelConfig { /* logical active batch */ int64_t problem_batch_; bool problem_along_x_; Policy policy_; + size_t prefer_wg_size_; int64_t problem_wg_range_; int64_t problem_glb_range_; size_t problem_range_; diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 5c64816fb..f0ab4df47 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -133,16 +133,39 @@ struct Var { } }; +template int get_max_group_size(int simd = SIMD32) { // The max work group size required by batch_norm needs to ensure that the two // subgroup reduces can obtain correct results. - int max_size = syclMaxWorkGroupSize(); + int max_size = syclMaxWorkGroupSize(); int shfl2_restricted_size = simd * simd; return max_size > shfl2_restricted_size ? shfl2_restricted_size : max_size; } +template int get_num_threads(int nelem, int restricted_simd = SIMD32) { - int max_size = get_max_group_size(restricted_simd); + int max_size = get_max_group_size(restricted_simd); + int thread_sizes[5] = {32, 64, 128, 256, max_size}; + for (int i = 0; i < 5; ++i) { + if (nelem <= thread_sizes[i]) { + return thread_sizes[i]; + } + } + return max_size; +} + +int get_dev_max_group_size(int simd = SIMD32) { + // The max work group size required by batch_norm needs to ensure that the two + // subgroup reduces can obtain correct results. + int max_size = syclDeviceMaxWorkGroupSize(); + int shfl2_restricted_size = simd * simd; + return max_size > shfl2_restricted_size ? 
shfl2_restricted_size : max_size; +} + +int get_num_threads_by_dev_max_group_size( + int nelem, + int restricted_simd = SIMD32) { + int max_size = get_dev_max_group_size(restricted_simd); int thread_sizes[5] = {32, 64, 128, 256, max_size}; for (int i = 0; i < 5; ++i) { if (nelem <= thread_sizes[i]) { @@ -565,39 +588,53 @@ void batch_norm_stats_template( auto& queue = getCurrentSYCLQueue(); int simd = get_prefer_simd(input.size(1), input.size(0) * input.size(2)); - int max_group_size = get_max_group_size(simd); - int tf = get_num_threads(input.size(2), simd); - int64_t work_group_size_x = tf; - int64_t work_group_size_y = std::max(1, max_group_size / tf); - int64_t global_size_x = input.size(1) * work_group_size_x; - int64_t global_size_y = 1 * work_group_size_y; if (simd == SIMD32) { - auto caller = BatchNormCollectStatisticsKernelFunctor< + using KernelClass = BatchNormCollectStatisticsKernelFunctor< SIMD32, VarTransform, scalar_t, scalar_t, accscalar_t, - index_t>(input, epsilon, 0.0, mean, invstd); + index_t>; + + auto kfn = KernelClass(input, epsilon, 0.0, mean, invstd); + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int64_t work_group_size_x = tf; + int64_t work_group_size_y = std::max(1, max_group_size / tf); + int64_t global_size_x = input.size(1) * work_group_size_x; + int64_t global_size_y = 1 * work_group_size_y; + sycl_kernel_submit( sycl::range<2>(global_size_y, global_size_x), sycl::range<2>(work_group_size_y, work_group_size_x), queue, - caller); + kfn); } else { - auto caller = BatchNormCollectStatisticsKernelFunctor< + using KernelClass = BatchNormCollectStatisticsKernelFunctor< SIMD16, VarTransform, scalar_t, scalar_t, accscalar_t, - index_t>(input, epsilon, 0.0, mean, invstd); + index_t>; + + auto kfn = KernelClass(input, epsilon, 0.0, mean, invstd); + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int64_t work_group_size_x = tf; + int64_t work_group_size_y = std::max(1, max_group_size / tf); + int64_t global_size_x = input.size(1) * work_group_size_x; + int64_t global_size_y = 1 * work_group_size_y; + sycl_kernel_submit( sycl::range<2>(global_size_y, global_size_x), sycl::range<2>(work_group_size_y, work_group_size_x), queue, - caller); + kfn); } } @@ -960,10 +997,11 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - size_t max_wg_sz = syclMaxWorkGroupSize(); - shmem_mean_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); - shmem_m2n_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); - shmem_count_ = sycl_local_acc_t(sycl::range<1>{max_wg_sz}, cgh); + shmem_mean_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_m2n_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_count_ = sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); is_last_group_done_ = sycl_local_acc_t(sycl::range<1>{1}, cgh); } @@ -975,7 +1013,8 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor int* semaphores, const int reduction_size, const int stride, - accscalar_t epsilon) + accscalar_t epsilon, + int wg_size) : input_(input), out_mean_(out_mean), out_invstd_(out_invstd), @@ -983,7 +1022,8 @@ struct BatchNormCollectStatisticsChannelsLastKernelFunctor semaphores_(semaphores), reduction_size_(reduction_size), stride_(stride), - epsilon_(epsilon) {} + epsilon_(epsilon), + wg_size_(wg_size) {} private: const scalar_t* __restrict__ input_; @@ -994,6 +1034,7 @@ struct 
BatchNormCollectStatisticsChannelsLastKernelFunctor const int reduction_size_; const int stride_; accscalar_t epsilon_; + int wg_size_; sycl_local_acc_t shmem_mean_; sycl_local_acc_t shmem_m2n_; sycl_local_acc_t shmem_count_; @@ -1039,7 +1080,7 @@ void batch_norm_stats_channels_last_template( int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormCollectStatisticsChannelsLastKernelFunctor< + auto kfn = BatchNormCollectStatisticsChannelsLastKernelFunctor< VarTransform, scalar_t, accscalar_t, @@ -1051,8 +1092,10 @@ void batch_norm_stats_channels_last_template( semaphores_ptr, reduction_size, stride, - epsilon); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + epsilon, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } std::tuple batch_norm_stats_kernel( @@ -1254,8 +1297,8 @@ void batch_norm_elemt_template( const double dummy_epsilon = 1e-5; int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); sycl::range<2> local_range(tb, tf); int nwg_x = input.size(1); @@ -1266,14 +1309,14 @@ void batch_norm_elemt_template( nwg_y = std::min(nwg_y, syclMaxWorkItemsPerTile() / (tf * tb)); sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormTransformInputKernelFunctor< + auto kfn = BatchNormTransformInputKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, true, index_t>(input, output, mean, invstd, weight, bias, dummy_epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -1388,7 +1431,7 @@ void batch_norm_elemt_channels_last_template( AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] { using accscalar_t = at::acc_type; - auto caller = BatchNormTransformInputChannelsLastKernelFunctor< + auto kfn = BatchNormTransformInputChannelsLastKernelFunctor< scalar_t, accscalar_t, accscalar_t, @@ -1403,7 +1446,7 @@ void batch_norm_elemt_channels_last_template( reduction_size, stride, fuse_relu); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -1417,7 +1460,7 @@ void batch_norm_elemt_channels_last_template( AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward_xpu", [&] { using accscalar_t = at::acc_type; - auto caller = BatchNormTransformInputChannelsLastKernelFunctor< + auto kfn = BatchNormTransformInputChannelsLastKernelFunctor< scalar_t, accscalar_t, scalar_t, @@ -1432,7 +1475,7 @@ void batch_norm_elemt_channels_last_template( reduction_size, stride, fuse_relu); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } } @@ -1599,7 +1642,7 @@ struct BatchNormBackwardReduceKernelFunctor void sycl_ker_config_convention(sycl::handler& cgh) { local_sum_ = sycl_local_acc_t>( - sycl::range<1>{(size_t)get_max_group_size(SIMD)}, cgh); + sycl::range<1>{(size_t)wg_size_}, cgh); } BatchNormBackwardReduceKernelFunctor( @@ -1636,7 +1679,8 @@ struct BatchNormBackwardReduceKernelFunctor GenericPackedTensorAccessor grad_weight, GenericPackedTensorAccessor - grad_bias) + grad_bias, + int 
wg_size) : input_(input), grad_output_(grad_output), mean_(mean), @@ -1644,7 +1688,8 @@ struct BatchNormBackwardReduceKernelFunctor sum_dy_(sum_dy), sum_dy_xmu_(sum_dy_xmu), grad_weight_(grad_weight), - grad_bias_(grad_bias) {} + grad_bias_(grad_bias), + wg_size_(wg_size) {} private: const GenericPackedTensorAccessor< @@ -1671,6 +1716,7 @@ struct BatchNormBackwardReduceKernelFunctor grad_weight_; GenericPackedTensorAccessor grad_bias_; + int wg_size_; sycl_local_acc_t> local_sum_; }; @@ -1739,21 +1785,24 @@ std::tuple batch_norm_backward_reduce_template( auto& queue = getCurrentSYCLQueue(); int simd = get_prefer_simd( input_reshaped.size(1), input_reshaped.size(0) * input_reshaped.size(1)); - int max_wg_size = get_max_group_size(simd); - int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); - int wg_size_x = std::min( - std::max(get_num_threads(feature_size, simd), simd), - max_wg_size / wg_size_y); - sycl::range<2> local_range(wg_size_y, wg_size_x); - sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); if (simd == SIMD32) { - auto caller = BatchNormBackwardReduceKernelFunctor< + using KernelClass = BatchNormBackwardReduceKernelFunctor< SIMD32, input_scalar_t, stat_scalar_t, stat_accscalar_t, - index_t>( + index_t>; + + int max_wg_size = get_max_group_size(simd); + int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); + int wg_size_x = std::min( + std::max(get_num_threads(feature_size, simd), simd), + max_wg_size / wg_size_y); + sycl::range<2> local_range(wg_size_y, wg_size_x); + sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); + + auto kfn = KernelClass( input, grad_output, mean, @@ -1761,15 +1810,27 @@ std::tuple batch_norm_backward_reduce_template( sum_dy, sum_dy_xmu, grad_weight, - grad_bias); - sycl_kernel_submit(global_range, local_range, queue, caller); + grad_bias, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardReduceKernelFunctor< + using KernelClass = BatchNormBackwardReduceKernelFunctor< SIMD16, input_scalar_t, stat_scalar_t, stat_accscalar_t, - index_t>( + index_t>; + + int max_wg_size = get_max_group_size(simd); + int wg_size_y = std::min(last_pow2(batch_size), max_wg_size / simd); + int wg_size_x = std::min( + std::max(get_num_threads(feature_size, simd), simd), + max_wg_size / wg_size_y); + sycl::range<2> local_range(wg_size_y, wg_size_x); + sycl::range<2> global_range(1 * wg_size_y, n_input * wg_size_x); + + auto kfn = KernelClass( input, grad_output, mean, @@ -1777,8 +1838,10 @@ std::tuple batch_norm_backward_reduce_template( sum_dy, sum_dy_xmu, grad_weight, - grad_bias); - sycl_kernel_submit(global_range, local_range, queue, caller); + grad_bias, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); } @@ -1961,10 +2024,10 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - shmem_sum_dy_ = sycl_local_acc_t( - sycl::range<1>{(size_t)get_max_group_size()}, cgh); - shmem_sum_dy_xmu_ = sycl_local_acc_t( - sycl::range<1>{(size_t)get_max_group_size()}, cgh); + shmem_sum_dy_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); + shmem_sum_dy_xmu_ = + sycl_local_acc_t(sycl::range<1>{(size_t)wg_size_}, cgh); is_last_group_done_ = sycl_local_acc_t(sycl::range<1>{1}, cgh); } @@ -1980,7 +2043,8 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor volatile 
accscalar_t* staging_data, int* semaphores, const int reduction_size, - const int stride) + const int stride, + const int wg_size) : input_(input), grad_output_(grad_output), mean_(mean), @@ -1992,7 +2056,8 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor staging_data_(staging_data), semaphores_(semaphores), reduction_size_(reduction_size), - stride_(stride) {} + stride_(stride), + wg_size_(wg_size) {} private: const scalar_t* __restrict__ input_; @@ -2007,6 +2072,7 @@ struct BatchNormBackwardReduceChannelsLastKernelFunctor int* semaphores_; const int reduction_size_; const int stride_; + const int wg_size_; sycl_local_acc_t shmem_sum_dy_; sycl_local_acc_t shmem_sum_dy_xmu_; sycl_local_acc_t is_last_group_done_; @@ -2069,7 +2135,7 @@ batch_norm_backward_reduce_channels_last_template( : nullptr; int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormBackwardReduceChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardReduceChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2085,8 +2151,9 @@ batch_norm_backward_reduce_channels_last_template( staging_data_ptr, semaphores_ptr, reduction_size, - stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + stride, + wg_size_y * wg_size_x); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -2109,7 +2176,8 @@ batch_norm_backward_reduce_channels_last_template( : nullptr; int* semaphores_ptr = nwg_y > 1 ? semaphores.mutable_data_ptr() : nullptr; - auto caller = BatchNormBackwardReduceChannelsLastKernelFunctor< + + auto kfn = BatchNormBackwardReduceChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2127,8 +2195,10 @@ batch_norm_backward_reduce_channels_last_template( staging_data_ptr, semaphores_ptr, reduction_size, - stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + stride, + wg_size_y * wg_size_x); + + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } @@ -2429,8 +2499,8 @@ Tensor batch_norm_backward_elemt_template( auto& queue = getCurrentSYCLQueue(); int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); int nwg_x = input.size(1); int nwg_y = std::max( @@ -2444,7 +2514,7 @@ Tensor batch_norm_backward_elemt_template( sycl::range<2> local_range(tb, tf); sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormBackwardElemtKernelFunctor< + auto kfn = BatchNormBackwardElemtKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, @@ -2458,7 +2528,7 @@ Tensor batch_norm_backward_elemt_template( sum_dy_xmu, grad_input, norm_fct); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); return grad_input_reshaped.view(input_.sizes()); } @@ -2510,8 +2580,8 @@ Tensor batch_norm_backward_elemt_template( auto& queue = getCurrentSYCLQueue(); int tf = std::max( - get_num_threads(input.size(2) / 4), - std::min(get_num_threads(input.size(2)), 64)); + get_num_threads_by_dev_max_group_size(input.size(2) / 4), + std::min(get_num_threads_by_dev_max_group_size(input.size(2)), 64)); int tb = std::max(64 / tf, 1); int nwg_x = input.size(1); int nwg_y = std::max( @@ -2523,7 +2593,7 @@ Tensor batch_norm_backward_elemt_template( sycl::range<2> local_range(tb, tf); 
sycl::range<2> global_range(nwg_y * tb, nwg_x * tf); - auto caller = BatchNormBackwardElemtKernelFunctor< + auto kfn = BatchNormBackwardElemtKernelFunctor< input_scalar_t, stat_scalar_t, stat_accscalar_t, @@ -2540,7 +2610,7 @@ Tensor batch_norm_backward_elemt_template( 0, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); return grad_input_reshaped.view(input_.sizes()); } @@ -2681,7 +2751,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( using accscalar_t = at::acc_type; if (weight.defined() && weight.scalar_type() != input.scalar_type()) { - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2697,9 +2767,9 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( static_cast(norm_fct), reduction_size, stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2715,7 +2785,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( static_cast(norm_fct), reduction_size, stride); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } }); @@ -2751,7 +2821,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( "batchnorm_backward_element_xpu", [&] { using accscalar_t = acc_type; - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2770,7 +2840,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( stride, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } else { if (weight.defined()) { @@ -2788,7 +2858,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( "batchnorm_backward_element_xpu", [&] { using accscalar_t = acc_type; - auto caller = BatchNormBackwardElemtChannelsLastKernelFunctor< + auto kfn = BatchNormBackwardElemtChannelsLastKernelFunctor< ELEMENTS_PER_ITER, scalar_t, accscalar_t, @@ -2807,7 +2877,7 @@ at::Tensor batch_norm_backward_elemt_channels_last_template( stride, count.const_data_ptr(), count.numel()); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); }); } @@ -3373,7 +3443,7 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { void sycl_ker_config_convention(sycl::handler& cgh) { local_sum_ = sycl_local_acc_t>( - sycl::range<1>{(size_t)get_max_group_size(SIMD)}, cgh); + sycl::range<1>{(size_t)wg_size_}, cgh); } BatchNormBackwardKernelFunctor( @@ -3419,7 +3489,8 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { DefaultPtrTraits, index_t> save_invstd, bool train, - stat_accscalar_t epsilon) + stat_accscalar_t epsilon, + int wg_size) : input_(input), grad_output_(grad_output), grad_input_(grad_input), @@ -3431,7 +3502,8 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { save_mean_(save_mean), save_invstd_(save_invstd), train_(train), - epsilon_(epsilon) {} + 
epsilon_(epsilon), + wg_size_(wg_size) {} private: const GenericPackedTensorAccessor< @@ -3484,6 +3556,7 @@ struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { save_invstd_; bool train_; stat_accscalar_t epsilon_; + int wg_size_; sycl_local_acc_t> local_sum_; }; @@ -3559,19 +3632,22 @@ std::tuple batch_norm_backward_template( input_reshaped.size(1), input_reshaped.size(0) * input_reshaped.size(1)); auto& queue = getCurrentSYCLQueue(); - int max_group_size = get_max_group_size(simd); - int tf = get_num_threads(input.size(2), simd); - int wg_sz_y = std::max(1, max_group_size / tf); - sycl::range<2> local_range(wg_sz_y, tf); - sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); if (simd == SIMD32) { - auto caller = BatchNormBackwardKernelFunctor< + using KernelClass = BatchNormBackwardKernelFunctor< SIMD32, input_scalar_t, stat_scalar_t, accscalar_t, - index_t>( + index_t>; + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int wg_sz_y = std::max(1, max_group_size / tf); + sycl::range<2> local_range(wg_sz_y, tf); + sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); + + auto kfn = KernelClass( input, grad_output, grad_input, @@ -3583,15 +3659,25 @@ std::tuple batch_norm_backward_template( save_mean, save_invstd, train, - epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + epsilon, + wg_sz_y * tf); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { - auto caller = BatchNormBackwardKernelFunctor< + using KernelClass = BatchNormBackwardKernelFunctor< SIMD16, input_scalar_t, stat_scalar_t, accscalar_t, - index_t>( + index_t>; + + int max_group_size = get_max_group_size(simd); + int tf = get_num_threads(input.size(2), simd); + int wg_sz_y = std::max(1, max_group_size / tf); + sycl::range<2> local_range(wg_sz_y, tf); + sycl::range<2> global_range(1 * wg_sz_y, input.size(1) * tf); + + auto kfn = KernelClass( input, grad_output, grad_input, @@ -3603,8 +3689,10 @@ std::tuple batch_norm_backward_template( save_mean, save_invstd, train, - epsilon); - sycl_kernel_submit(global_range, local_range, queue, caller); + epsilon, + wg_sz_y * tf); + + sycl_kernel_submit(global_range, local_range, queue, kfn); } return std::make_tuple(grad_input_, grad_weight_, grad_bias_); } diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp index 213283de0..d56eff222 100644 --- a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -125,23 +125,6 @@ void searchsorted_template( const bool& right, const Tensor& sorter) { int64_t numel_in = input.numel(); - int64_t rng, grng, tile_size; - tile_size = syclMaxWorkGroupSize(); - rng = numel_in; - if (rng == 0) { - rng = static_cast(1); - } - - grng = rng; - if (tile_size > grng) { - tile_size = grng; - } else if (grng > tile_size) { - int64_t xMode = static_cast(grng % tile_size); - if (xMode != 0) { - grng += static_cast(tile_size - xMode); - } - } - bool is_scalar_input = input.dim() == 0 && numel_in == 1; // inner most dim size of input and boundaries int64_t idim_in = is_scalar_input ? 
1 : input.sizes().back(); @@ -167,6 +150,23 @@ void searchsorted_template( data_bd_data, data_out_data); + int64_t rng, grng, tile_size; + tile_size = syclMaxWorkGroupSize(kfn); + rng = numel_in; + if (rng == 0) { + rng = static_cast(1); + } + + grng = rng; + if (tile_size > grng) { + tile_size = grng; + } else if (grng > tile_size) { + int64_t xMode = static_cast(grng % tile_size); + if (xMode != 0) { + grng += static_cast(tile_size - xMode); + } + } + sycl_kernel_submit(grng, tile_size, getCurrentSYCLQueue(), kfn); } @@ -243,4 +243,4 @@ void searchsorted_kernel( result.copy_(out); } } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp index 009d86859..358b3ed84 100644 --- a/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp +++ b/src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp @@ -366,12 +366,15 @@ void launch_max_pool2d_kernel( int padW, int dilationH, int dilationW) { + using KernelClass = MaxPool2dKernelFunctor; + auto& queue = at::xpu::getCurrentSYCLQueue(); int outputSize = numBatch * numPlane * outputSizeH * outputSizeW; int stride = numPlane * outputSizeH * outputSizeW; BatchKernelConfig cfg = { 1, outputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = MaxPool2dKernelFunctor( + cfg.template build(); + auto kfn = KernelClass( output, indices, input, @@ -423,37 +426,42 @@ void launch_max_pool2d_backward_kernel( if (globalContext().deterministicAlgorithms() || std::is_same_v || std::is_same_v) { + using KernelClass = + MaxPool2dBackwardDeterministicKernelFunctor; BatchKernelConfig cfg = { 1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = - MaxPool2dBackwardDeterministicKernelFunctor( - gradInput, - gradOutput, - indices, - numPlane, - gradInputSizeH, - gradInputSizeW, - gradOutputSizeH, - gradOutputSizeW, - gradInputSize, - out_cf_c_stride, - in_cf_c_stride, - out_n_stride, - in_n_stride, - kernel_h, - kernel_w, - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - cfg); + cfg.template build(); + auto kfn = KernelClass( + gradInput, + gradOutput, + indices, + numPlane, + gradInputSizeH, + gradInputSizeW, + gradOutputSizeH, + gradOutputSizeW, + gradInputSize, + out_cf_c_stride, + in_cf_c_stride, + out_n_stride, + in_n_stride, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + cfg); sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn); } else { + using KernelClass = + MaxPool2dBackwardKernelFunctor; BatchKernelConfig cfg = { 1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive}; - auto kfn = MaxPool2dBackwardKernelFunctor( + cfg.template build(); + auto kfn = KernelClass( gradInput, gradOutput, indices, diff --git a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h index a68c7a05b..b7d65796f 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h +++ b/src/ATen/native/xpu/sycl/EmbeddingBackwardKernel.h @@ -228,15 +228,6 @@ void compute_grad_weight_bags( const Tensor& segment_offsets, int64_t num_of_segments, const Tensor& grad_weight_per_segment) { - constexpr int SYCL_MAX_SUB_GROUP_SIZE = 32; - int64_t work_group_size = syclMaxWorkGroupSize(); - - int64_t stride_warped = - CeilDiv(stride, SYCL_MAX_SUB_GROUP_SIZE) * SYCL_MAX_SUB_GROUP_SIZE; - int64_t group_size = std::min(stride_warped, work_group_size); - auto num_groups = 
CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - bool per_sample_weight_defined = per_sample_weights.defined(); bool count_defined = count.defined(); int64_t per_sample_weights_stride = @@ -258,10 +249,11 @@ void compute_grad_weight_bags( // buffer. auto segment_offsets_data = segment_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); + int64_t max_sub_group_size = syclMaxSubGroupSize(); + int64_t stride_warped = + CeilDiv(stride, max_sub_group_size) * max_sub_group_size; - auto caller = ComputeGradWeightBagsKernelFunctor( + auto kfn = ComputeGradWeightBagsKernelFunctor( numel, stride, mode_mean, @@ -278,7 +270,15 @@ void compute_grad_weight_bags( bag_size_data, per_sample_weights_data, segment_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } template @@ -358,14 +358,6 @@ void compute_grad_weight( const Tensor& segment_offsets, int64_t num_of_segments, const Tensor& grad_weight_per_segment) { - constexpr int SYCL_MAX_SUB_GROUP_SIZE = 32; - int64_t work_group_size = syclMaxWorkGroupSize(); - int64_t stride_warped = - CeilDiv(stride, SYCL_MAX_SUB_GROUP_SIZE) * SYCL_MAX_SUB_GROUP_SIZE; - int64_t group_size = std::min(stride_warped, work_group_size); - auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - bool count_defined = count.defined(); auto grad_weight_per_segment_data = @@ -377,10 +369,11 @@ void compute_grad_weight( : indices_data; // use the indices_data handler as the dummy buffer. 
auto segment_offsets_data = segment_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); + int64_t max_sub_group_size = syclMaxSubGroupSize(); + int64_t stride_warped = + CeilDiv(stride, max_sub_group_size) * max_sub_group_size; - auto caller = ComputeGradWeightKernelFunctor( + auto kfn = ComputeGradWeightKernelFunctor( numel, stride, num_of_segments, @@ -391,7 +384,15 @@ void compute_grad_weight( grad_output_data, count_data, segment_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } template @@ -449,6 +450,10 @@ struct SumAndScatterKernelFunctor { grad_weight_per_segment_data_(grad_weight_per_segment_data), segment_sizes_offsets_data_(segment_sizes_offsets_data) {} + void set_stride_warped(int64_t stride_warped) { + stride_warped_ = stride_warped; + } + private: int64_t stride_; int64_t num_of_segments_; @@ -473,12 +478,6 @@ void sum_and_scatter( const Tensor& segment_sizes_offsets, int64_t num_of_partial_segments, const int64_t padding_idx) { - int64_t work_group_size = syclMaxWorkGroupSize(); - int64_t stride_warped = CeilDiv(stride, work_group_size) * work_group_size; - int64_t group_size = std::min(stride_warped, syclMaxWorkGroupSize()); - auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); - auto total_items = num_groups * group_size; - auto grad_weight_data = grad_weight.data_ptr(); auto input_data = input.data_ptr(); auto segment_offsets_data = segment_offsets.data_ptr(); @@ -486,20 +485,29 @@ void sum_and_scatter( grad_weight_per_segment.data_ptr>(); auto segment_sizes_offsets_data = segment_sizes_offsets.data_ptr(); - auto global_range = sycl::range<1>((size_t)total_items); - auto local_range = sycl::range<1>((size_t)group_size); - auto caller = SumAndScatterKernelFunctor( + auto kfn = SumAndScatterKernelFunctor( stride, num_of_segments, num_of_partial_segments, padding_idx, - stride_warped, + /* stride_warped */ 0, grad_weight_data, input_data, segment_offsets_data, grad_weight_per_segment_data, segment_sizes_offsets_data); - sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), caller); + + int64_t work_group_size = syclMaxWorkGroupSize(kfn); + int64_t stride_warped = CeilDiv(stride, work_group_size) * work_group_size; + kfn.set_stride_warped(stride_warped); + + int64_t group_size = std::min(stride_warped, work_group_size); + auto num_groups = CeilDiv(num_of_segments * stride_warped, group_size); + auto total_items = num_groups * group_size; + auto global_range = sycl::range<1>((size_t)total_items); + auto local_range = sycl::range<1>((size_t)group_size); + + sycl_kernel_submit(global_range, local_range, getCurrentSYCLQueue(), kfn); } struct EmbeddingBackwardDeterministicKernelCopyIfFunctor { diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp index 751dea41c..9b7366781 100644 --- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp +++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp @@ -42,6 +42,15 @@ void embedding_bag( using vec_t = at::detail::Array; 
using vec_acc_t = at::detail::Array; using vec_idx_t = at::detail::Array; + using KernelClass = EmbeddingBagKernelFunctor< + scalar_t, + accscalar_t, + index_t, + mode, + vec_size, + vec_t, + vec_acc_t, + vec_idx_t>; vec_t* o_vec = reinterpret_cast(output); vec_t* w_vec = reinterpret_cast(weights); @@ -50,16 +59,10 @@ void embedding_bag( vec_len = vec_len / vec_size; BatchKernelConfig cfg = { bag_num, vec_len, 1, bag_num, true, BatchKernelConfig::Policy::pAdaptive}; + cfg.template build(); + index_t fixing_bag_size = ignore_offsets ? index_size / bag_num : 0; - auto caller = EmbeddingBagKernelFunctor< - scalar_t, - accscalar_t, - index_t, - mode, - vec_size, - vec_t, - vec_acc_t, - vec_idx_t>( + auto kfn = KernelClass( index, offset, offset2bag, @@ -77,7 +80,7 @@ void embedding_bag( cfg, fixing_bag_size); sycl_kernel_submit( - cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), caller); + cfg.global_size(), cfg.group_size(), getCurrentSYCLQueue(), kfn); } #define EMBBAG_KERNEL_ACC( \ diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp index 2ad35e5d9..104147f71 100644 --- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp +++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp @@ -157,16 +157,18 @@ void launch_lpnorm_chunk_reduce_kernel( AT_PRIVATE_CASE_TYPE_USING_HINT( \ at::ScalarType::BFloat16, out_t, __VA_ARGS__)) -std::vector foreach_norm_kernel( +template +void foreach_norn_kernel_config( TensorList tensors, - const Scalar& ord, - double p, - c10::optional dtype) { + TensorOptions output_per_tensor_option, + int64_t& wg_size, + int& max_chunks_per_tensor, + Tensor& output_per_tensor) { const int ntensors = tensors.size(); - int max_chunks_per_tensor = -1; - int64_t wg_size = multi_tensor_apply_kernel_get_wg_size(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + max_chunks_per_tensor = -1; + wg_size = multi_tensor_apply_kernel_get_wg_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); for (int t = 0; t < ntensors; t++) { int max_chunks_this_tensor = @@ -176,18 +178,29 @@ std::vector foreach_norm_kernel( } } - const auto options = tensors[0].options(); - const ScalarType output_dtype = // tensors[0].scalar_type(); - dtype.has_value() ? dtype.value() : tensors[0].scalar_type(); - const ScalarType output_per_tensor_dtype = toOpMathType(output_dtype); - auto output_per_tensor = at::zeros( + output_per_tensor = at::zeros( {static_cast(ntensors) * max_chunks_per_tensor}, - options.dtype(output_per_tensor_dtype)); + output_per_tensor_option); +} - const auto res_option = options.dtype(output_dtype); - auto ret_per_tensor = at::empty({ntensors}, res_option); +std::vector foreach_norm_kernel( + TensorList tensors, + const Scalar& ord, + double p, + c10::optional dtype) { + const int ntensors = tensors.size(); + const ScalarType output_dtype = // tensors[0].scalar_type(); + dtype.has_value() ? 
dtype.value() : tensors[0].scalar_type(); + const auto options = tensors[0].options(); + auto output_per_tensor_option = options.dtype(toOpMathType(output_dtype)); + auto res_option = options.dtype(output_dtype); + auto ret_per_tensor = at::empty({ntensors}, res_option); auto tensor_lists = std::vector>{tensors.vec()}; + + int64_t wg_size; + int max_chunks_per_tensor; + Tensor output_per_tensor; if (p == static_cast(1)) { AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -198,12 +211,24 @@ std::vector foreach_norm_kernel( AT_DISPATCH_OUT_DTYPES( output_dtype, "foreach_norm_out_dtype_xpu", [&]() { using out_opmath_t = typename at::opmath_type; + using KernelClass = lpnormChunkReduceKernelFunctor< + out_t, + NormType::L1, + out_opmath_t>; + foreach_norn_kernel_config( + tensors, + output_per_tensor_option, + wg_size, + max_chunks_per_tensor, + output_per_tensor); + // sum temp val for each chunk multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); + // sum final val for all chunks launch_lpnorm_chunk_reduce_kernel< out_t, @@ -226,11 +251,23 @@ std::vector foreach_norm_kernel( AT_DISPATCH_OUT_DTYPES( output_dtype, "foreach_norm_out_dtype_xpu", [&]() { using out_opmath_t = typename at::opmath_type; + using KernelClass = lpnormChunkReduceKernelFunctor< + out_t, + NormType::L2, + out_opmath_t>; + foreach_norn_kernel_config( + tensors, + output_per_tensor_option, + wg_size, + max_chunks_per_tensor, + output_per_tensor); + multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); + launch_lpnorm_chunk_reduce_kernel< out_t, NormType::L2, diff --git a/src/ATen/native/xpu/sycl/GridSampler.cpp b/src/ATen/native/xpu/sycl/GridSampler.cpp index 746e2b035..9427fd4ce 100644 --- a/src/ATen/native/xpu/sycl/GridSampler.cpp +++ b/src/ATen/native/xpu/sycl/GridSampler.cpp @@ -252,10 +252,6 @@ void grid_sampler_2d_forward_template( const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners) { - auto& queue = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; - index_t C = input.sizes[1]; index_t inp_H = input.sizes[2]; index_t inp_W = input.sizes[3]; @@ -299,6 +295,11 @@ void grid_sampler_2d_forward_template( out_sC, out_sH, out_sW); + + const auto wgroup_size = syclMaxWorkGroupSize(kfn); + const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(ngroups * wgroup_size), sycl::range<1>(wgroup_size), @@ -700,10 +701,6 @@ void grid_sampler_2d_backward_template( const GridSamplerPadding padding_mode, bool align_corners, const bool input_requires_grad) { - auto& queue = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; - index_t C = input.sizes[1]; index_t inp_H = input.sizes[2]; index_t inp_W = input.sizes[3]; @@ -768,6 +765,11 @@ void grid_sampler_2d_backward_template( gInp_sH, gInp_sW, gGrid_sW); + + const auto wgroup_size = syclMaxWorkGroupSize(kfn); + const auto ngroups = (nthreads + wgroup_size - 1) / wgroup_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(ngroups * wgroup_size), sycl::range<1>(wgroup_size), diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 08adcd608..ae9bf0005 100644 --- 
a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -257,9 +257,9 @@ void group_norm_kernel_impl( auto& queue = getCurrentSYCLQueue(); int64_t simd = syclMaxSubGroupSize(); - const int64_t wg_size = D * HxW < get_group_reduce_group_size() + const int64_t wg_size = D * HxW < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); int64_t nwg = N * G; auto global_range = sycl::range<1>(nwg * wg_size); auto local_range = sycl::range<1>(wg_size); @@ -384,8 +384,10 @@ struct Compute1dBackwardFusedParamsFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } Compute1dBackwardFusedParamsFunctor( @@ -664,9 +666,9 @@ void group_norm_1d_backward( T_ACC* c2_data = c2.mutable_data_ptr(); T_ACC* c3_data = c3.mutable_data_ptr(); - const int64_t wg_size = (C / G) < get_group_reduce_group_size() + const int64_t wg_size = (C / G) < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); auto global_range = sycl::range<2>(G, N * wg_size); auto local_range = sycl::range<2>(1, wg_size); group_norm_kernel_simd_choice_and_launch< @@ -717,7 +719,7 @@ void group_norm_1d_backward( T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; if (N <= 128) { - const int64_t wg_size = get_group_reduce_group_size(); + const int64_t wg_size = get_group_reduce_group_size(simd); const int64_t B = (C + wg_size - 1) / wg_size; auto caller = GammaBeta1dBackwardSmallKernel( N, @@ -784,8 +786,10 @@ struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } ComputeInternalGradientsFunctor( @@ -857,8 +861,10 @@ struct ComputeBackwardFusedParamsFunctor } void sycl_ker_config_convention(sycl::handler& cgh) { - ds_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); - db_shared_ = sycl_local_acc_t(get_group_reduce_group_size(), cgh); + ds_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); + db_shared_ = + sycl_local_acc_t(get_group_reduce_group_size(SIMD), cgh); } ComputeBackwardFusedParamsFunctor( @@ -1144,9 +1150,9 @@ void group_norm_backward_kernel_impl( auto& queue = getCurrentSYCLQueue(); int64_t simd = syclMaxSubGroupSize(); - int64_t wg_size = HxW < get_group_reduce_group_size() + int64_t wg_size = HxW < get_group_reduce_group_size(simd) ? simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); group_norm_kernel_simd_choice_and_launch< ComputeInternalGradientsFunctor, ComputeInternalGradientsFunctor>( @@ -1177,9 +1183,9 @@ void group_norm_backward_kernel_impl( gpu_kernel(iter, GroupNormBackwardC1Functor()); } - wg_size = (C / G) < get_group_reduce_group_size() + wg_size = (C / G) < get_group_reduce_group_size(simd) ? 
simd - : get_group_reduce_group_size(); + : get_group_reduce_group_size(simd); group_norm_kernel_simd_choice_and_launch< ComputeBackwardFusedParamsFunctor, ComputeBackwardFusedParamsFunctor>( diff --git a/src/ATen/native/xpu/sycl/GroupReduceUtils.h b/src/ATen/native/xpu/sycl/GroupReduceUtils.h index 95ef90a69..c85877da9 100644 --- a/src/ATen/native/xpu/sycl/GroupReduceUtils.h +++ b/src/ATen/native/xpu/sycl/GroupReduceUtils.h @@ -12,8 +12,10 @@ namespace at { namespace native { namespace xpu { -inline size_t get_group_reduce_group_size() { - return syclMaxWorkGroupSize() / 2; +inline int get_group_reduce_group_size(int simd) { + // Limited by group reduce implementation. We use two sub group shuffles, + // The second sub group shuffle only could handle simd size elements. + return std::min(512, simd * simd); } template diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp index 5b2981f12..1149a0cfc 100644 --- a/src/ATen/native/xpu/sycl/Indexing.cpp +++ b/src/ATen/native/xpu/sycl/Indexing.cpp @@ -82,12 +82,28 @@ static inline void _index_select_kernel( IdxInfo& index_info, int64_t dim) { using scalar_t = typename SrcInfo::scalar_t; - auto cfg = IndexKernelConfig< + using IdxConfig = IndexKernelConfig< SrcInfo, DstInfo, IdxInfo, - IndexSelectScalarFunctor>:: - make_config( + IndexSelectScalarFunctor>; + + using IndexKnownProblemInnerKernel = + IndexKernel; + auto IndexKnownProblemInnerKernel_cfg = + IdxConfig::template make_config( + src_info, + dst_info, + index_info, + static_cast(0), + dim, + false, + IndexSelectScalarFunctor()); + + using IndexUnknownProblemInnerKernel = + IndexKernel; + auto IndexUnknownProblemInnerKernel_cfg = + IdxConfig::template make_config( src_info, dst_info, index_info, @@ -95,10 +111,13 @@ static inline void _index_select_kernel( dim, false, IndexSelectScalarFunctor()); - if (cfg.problem_inner_) { - launch_index_kernel(cfg); + + if (IndexKnownProblemInnerKernel_cfg.problem_inner_) { + launch_index_kernel( + IndexKnownProblemInnerKernel_cfg); } else { - launch_index_kernel(cfg); + launch_index_kernel( + IndexUnknownProblemInnerKernel_cfg); } } @@ -390,19 +409,21 @@ void index_add_kernel( getTensorInfo(self_); int new_indexing_dim = dst_info.collapseDims(dim); - auto cfg = IndexKernelConfig< + using IdxConfig = IndexKernelConfig< decltype(src_info), decltype(dst_info), decltype(index_info), - IndexAddScalarFunctor>:: - make_config( - src_info, - dst_info, - index_info, - alpha.to(), - new_indexing_dim, - true, - IndexAddScalarFunctor()); + IndexAddScalarFunctor>; + using KernelClass = IndexKernel; + + auto cfg = IdxConfig::template make_config( + src_info, + dst_info, + index_info, + alpha.to(), + new_indexing_dim, + true, + IndexAddScalarFunctor()); launch_index_kernel(cfg); }); }); diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h index 5ed09de0e..7ce476df5 100644 --- a/src/ATen/native/xpu/sycl/Indexing.h +++ b/src/ATen/native/xpu/sycl/Indexing.h @@ -106,6 +106,7 @@ class IndexKernelConfig : public BatchKernelConfig { return; } + template static IndexKernelConfig make_config( SrcInfo& src_info, DstInfo& dst_info, @@ -154,7 +155,7 @@ class IndexKernelConfig : public BatchKernelConfig { problem_inner); } - return { + IndexKernelConfig cfg = { src_info, dst_info, index_info, @@ -169,6 +170,9 @@ class IndexKernelConfig : public BatchKernelConfig { stride, problem_batch, problem_along_x}; + + cfg.template build(); + return cfg; } public: @@ -512,11 +516,13 @@ void small_index_kernel( IntArrayRef 
non_index_size, IntArrayRef non_index_stride, const func_t f) { + using index_buf_type = char*; + using KernelClass = SmallIndexKernelFunctor; + auto numel = iter.numel(); auto indices_size = iter.tensor(2).size(-1); auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_group_num = syclMaxDSSNum(dev_id) * OVER_SUBSCRIBE_DSS_FACTOR; + int64_t max_group_num = syclMaxDSSNum() * OVER_SUBSCRIBE_DSS_FACTOR; auto total_index_iter = numel / indices_size; max_group_num = std::min(int64_t(total_index_iter / 2), max_group_num); @@ -529,7 +535,7 @@ void small_index_kernel( auto group_numel = group_index_iter * indices_size; auto group_numel_tail = (group_index_iter - 1) * indices_size; - auto wgroup_size = syclMaxWorkGroupSize(dev_id); + auto wgroup_size = syclMaxWorkGroupSize(); wgroup_size = std::min(decltype(wgroup_size)(group_numel), wgroup_size); auto global_size = max_group_num * wgroup_size; @@ -555,13 +561,12 @@ void small_index_kernel( auto out_data = (char*)iter.data_ptr(0); auto in_data = (char*)iter.data_ptr(1); - using index_buf_type = decltype((char*)iter.data_ptr(0)); at::detail::Array index_ptrs; for (size_t i = 0; i < num_indices; i++) { index_ptrs[i] = (char*)iter.data_ptr(i + 2); } - SmallIndexKernelFunctor kfn( + KernelClass kfn( f, indices_size, group_num_tail, @@ -741,16 +746,18 @@ void _index_kernel( } } if (small_index) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_group_num = syclMaxDSSNum(dev_id); - auto wgroup_size = syclMaxWorkGroupSize(dev_id); + using index_buf_type = char*; + using KernelClass = SmallIndexKernelFunctor; + + int64_t max_group_num = syclMaxDSSNum(); + auto wgroup_size = syclMaxWorkGroupSize(); auto indices_size = iter.tensor(2).size(-1); auto total_index_iter = numel / indices_size; auto local_index = numel / max_group_num; // the max_local_mem_size = 65536B (64KB) // TODO: Is this right? - auto max_local_mem_size = syclLocalMemSize(dev_id); + auto max_local_mem_size = syclLocalMemSize(); auto indice_table_size = indices_size * sizeof(int64_t); // check whether the current case satisfying conditions 2,3,4 @@ -870,7 +877,11 @@ void launch_index_put_deterministic_kernel( // align with precision of CPU backend. using accscalar_t = scalar_t; /* acc_type; */ - IndexPutDeterministicKernelFunctor kfn( + using KernelClass = IndexPutDeterministicKernelFunctor; + + cfg.template build(); + + KernelClass kfn( sorted_indices, indices, value, diff --git a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp index e487a426f..f3d6c1f94 100644 --- a/src/ATen/native/xpu/sycl/LossNLLKernel.cpp +++ b/src/ATen/native/xpu/sycl/LossNLLKernel.cpp @@ -226,6 +226,9 @@ void nll_loss_forward_template( int64_t batch_size = input.size(0); if (reduction == at::Reduction::None && n_dims == 2) { + using NllLossForwardNoReduceKernel = + NllLossForwardNoReduceKernelFunctor; + output.resize_({batch_size}); total_weight.zero_(); int64_t target_stride = target.stride(0); @@ -233,7 +236,7 @@ void nll_loss_forward_template( auto weight_cont = weight.defined() ? weight.contiguous() : weight; auto& queue = getCurrentSYCLQueue(); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); bool has_weight = weight.defined() ? true : false; // sycl kernel can not accept host pointer @@ -248,7 +251,7 @@ void nll_loss_forward_template( ? weight_cont.data_ptr() : input_data; // use the input as the dummy data. 
auto output_data = output.data_ptr(); - NllLossForwardNoReduceKernelFunctor kfn( + NllLossForwardNoReduceKernel kfn( input_data, target_data, weight_data, @@ -304,9 +307,12 @@ void nll_loss_forward_template( sycl_kernel_submit(sycl::range<1>(local_size), queue, kfn); } else if (input_cont.dim() == 2) { + using NllLossForwardReduce2DKernel = + NllLossForwardReduce2DKernelFunctor; + int64_t batch_size = input.size(0); int n_target = input.size(1); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); auto input_data = _input_data; auto weight_data = has_weight ? _weight_data @@ -527,12 +533,15 @@ static inline void nll_loss_backward_template( int64_t batch_size = input.size(0); if (reduction == at::Reduction::None && n_dims == 2) { + using NllLossBackwardNoReduceKernel = + NllLossBackwardNoReduceKernelFunctor; + int64_t target_stride = target.stride(0); check_dim_size(gradOutput, 1, 0, batch_size); auto weight_cont = weight.defined() ? weight.contiguous() : weight; auto& queue = getCurrentSYCLQueue(); - int64_t local_size = syclMaxWorkGroupSize(); + int64_t local_size = syclMaxWorkGroupSize(); int64_t global_size = ((batch_size + local_size - 1) / local_size) * local_size; bool has_weight = weight.defined() ? true : false; @@ -547,7 +556,7 @@ static inline void nll_loss_backward_template( ? weight_cont.data_ptr() : gradOutput_data; // Use gradOutput handler as dummy weight auto gradInput_data = gradInput.data_ptr(); - NllLossBackwardNoReduceKernelFunctor kfn( + NllLossBackwardNoReduceKernel kfn( target_data, gradOutput_data, weight_data, diff --git a/src/ATen/native/xpu/sycl/MultiTensorApply.h b/src/ATen/native/xpu/sycl/MultiTensorApply.h index 8f9792a87..0817e40be 100644 --- a/src/ATen/native/xpu/sycl/MultiTensorApply.h +++ b/src/ATen/native/xpu/sycl/MultiTensorApply.h @@ -48,12 +48,14 @@ struct TLMetaForWG { uint32_t wg_to_chunk; }; -static inline int64_t multi_tensor_apply_kernel_get_wg_size() { - return syclMaxWorkGroupSize(); +template +static int64_t multi_tensor_apply_kernel_get_wg_size() { + return syclMaxWorkGroupSize(); } -static inline int64_t multi_tensor_apply_kernel_get_chunk_size() { - int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); +template +static int64_t multi_tensor_apply_kernel_get_chunk_size() { + int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); return max_wg_size * kElementPerThread; } @@ -116,17 +118,18 @@ void launch_multi_tensor_apply_kernel( U callable, int num_wg, ArgTypes... args) { + using KernelClass = MultiTensorApplyKernelFunctor; + auto& q = getCurrentSYCLQueue(); - int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t max_wg_size = multi_tensor_apply_kernel_get_wg_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); if constexpr (fused_kernel) { max_wg_size = multi_tensor_apply_fused_kernel_get_wg_size(); kChunkSize = multi_tensor_apply_fused_kernel_get_chunk_size(); } - MultiTensorApplyKernelFunctor kfn( - kChunkSize, tlAddressMeta, tlWGMeta, callable, args...); + KernelClass kfn(kChunkSize, tlAddressMeta, tlWGMeta, callable, args...); sycl_kernel_submit( sycl::range<1>(num_wg * max_wg_size), @@ -141,14 +144,20 @@ void multi_tensor_apply( at::ArrayRef scalars, T callable, ArgTypes... 
args) { + using scalar_vals_t = typename T::opmath_t; + using KernelClass = MultiTensorApplyKernelFunctor< + TLMetaForAddressScalar*, + TLMetaForWG*, + T, + ArgTypes...>; + TORCH_CHECK( tensor_lists.size() == depth, "Number of tensor lists has to match he depth"); size_t n_tensors = tensor_lists[0].size(); - using scalar_vals_t = typename T::opmath_t; auto& q = getCurrentSYCLQueue(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); auto addressStorage = at::empty( {(int)(sizeof(TLMetaForAddressScalar) * n_tensors)}, @@ -221,13 +230,19 @@ void multi_tensor_apply( std::vector>& tensor_lists, T callable, ArgTypes... args) { + using KernelClass = MultiTensorApplyKernelFunctor< + TLMetaForAddress*, + TLMetaForWG*, + T, + ArgTypes...>; + TORCH_CHECK( tensor_lists.size() == depth, "Number of tensor lists has to match he depth"); size_t n_tensors = tensor_lists[0].size(); auto& q = getCurrentSYCLQueue(); - int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); + int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(); auto addressStorage = at::empty( {(int)(sizeof(TLMetaForAddress) * n_tensors)}, diff --git a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp index a8dc4e97c..e1c50a263 100644 --- a/src/ATen/native/xpu/sycl/NonzeroKernel.cpp +++ b/src/ATen/native/xpu/sycl/NonzeroKernel.cpp @@ -112,13 +112,13 @@ void nonzero_template(const Tensor& self_, Tensor& tensor) { } const int64_t N = num_nonzeros * num_dim; - const auto wg_sz = std::min(syclMaxWorkGroupSize(), N); - const auto num_wg = (N + wg_sz - 1) / wg_sz; - // restore flatten idx to indices FlattenIdxtoRealIdxKernelFunctor kfn( N, num_dim, tensor_begin, idx_flat_begin, divisor, sizes); + const auto wg_sz = std::min(syclMaxWorkGroupSize(kfn), N); + const auto num_wg = (N + wg_sz - 1) / wg_sz; + sycl_kernel_submit(wg_sz * num_wg, wg_sz, getCurrentSYCLQueue(), kfn); // Support non-contiguous/outplace cases diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index d607667fe..da9bdf12b 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -15,6 +15,9 @@ namespace xpu { using namespace at::native::memory; using namespace at::xpu; +// syclDeviceMaxWorkGroup is allowed for launching Norm kernels, only if SIMD +// is 32. Related kernels include FusedNormKernelFunctor and +// RowwiseMomentsKernelFunctor. Don't change SIMD, unless refactor the kernels. 
constexpr int SIMD = 32; template < @@ -277,9 +280,8 @@ class NormConfig { // get resource size for Reduce problem [batch_size, problem_size] // the reduce is performed on problem_size dimension void get_workgroup_size() { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_workgroup_size = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int max_workgroup_size = syclDeviceMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); workgroup_num = total_resource / max_workgroup_size; int max_workgroup_num_foreach = 1; workgroup_size = max_workgroup_size; @@ -311,9 +313,8 @@ class NormConfig { void get_workgroup_size_row() { // enlarge the occupancy, compute the least workgroup_num - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_workgroup_size = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int max_workgroup_size = syclDeviceMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); workgroup_num = total_resource / max_workgroup_size; int max_block_row = max_workgroup_size / SIMD; @@ -1048,18 +1049,21 @@ template < bool one_moment = false> void launch_norm_eltwise_update_kernel(Norm& norm) { using vec_t = aligned_vector; - int total_threads = syclMaxWorkItemsPerTile(); - auto workgroup_size = syclMaxWorkGroupSize(); - index_t loops_end = (norm.numel() + vec_size - 1) / vec_size; - - auto kfn = NormEltwiseUpdateKernelFunctor< + using KernelClass = NormEltwiseUpdateKernelFunctor< scalar_t, mean_t, weight_t, index_t, vec_size, Norm, - vec_t>(norm, loops_end, total_threads); + vec_t>; + + int total_threads = syclMaxWorkItemsPerTile(); + auto workgroup_size = syclMaxWorkGroupSize(); + index_t loops_end = (norm.numel() + vec_size - 1) / vec_size; + + auto kfn = KernelClass(norm, loops_end, total_threads); + sycl_kernel_submit(total_threads, workgroup_size, getCurrentSYCLQueue(), kfn); } diff --git a/src/ATen/native/xpu/sycl/RandpermKernel.cpp b/src/ATen/native/xpu/sycl/RandpermKernel.cpp index 233f4d18b..d151de28c 100644 --- a/src/ATen/native/xpu/sycl/RandpermKernel.cpp +++ b/src/ATen/native/xpu/sycl/RandpermKernel.cpp @@ -88,7 +88,8 @@ void randperm_handle_duplicate_keys( T mask = static_cast((1UL << bits) - 1); HandleDuplicateKeysKernelFunctor kfn(keys, data, mask, n, rng_engine_inputs); - auto local_range = syclMaxWorkGroupSize() / 2; + + auto local_range = syclMaxWorkGroupSize(kfn) / 2; auto num_wg = (n + local_range - 1) / local_range; auto global_range = num_wg * local_range; diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index ba0820e7c..1be3a5e93 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -237,9 +237,9 @@ struct ReduceConfig { int input_vec_size = 1; int output_vec_size = 1; - template + template void set_group_dimension(int64_t dim0, int64_t dim1) { - auto max_wg_sz = syclMaxWorkGroupSize(); + auto max_wg_sz = syclMaxWorkGroupSize(); auto max_sg_sz = syclMaxSubGroupSize(); const int max_num_items = max_wg_sz / output_vec_size; int dim0_pow2 = dim0 < max_num_items ? 
static_cast(last_pow2(dim0)) @@ -1429,7 +1429,22 @@ inline void gpu_reduce_kernel( } // Adjust group_width and group_height - config.set_group_dimension(dim0, dim1); + // Mapping to launch_reduce_kernel + using R = ReduceOp; + switch (config.output_vec_size) { + case 4: { + config.set_group_dimension>(dim0, dim1); + break; + } + case 2: { + config.set_group_dimension>(dim0, dim1); + break; + } + default: { + config.set_group_dimension>(dim0, dim1); + break; + } + } int group_width = config.group_width; int group_height = config.group_height; @@ -1511,20 +1526,20 @@ inline void gpu_reduce_kernel( AT_ASSERT(can_use_32bit_indexing); auto output_calc = make_output_calculator(iter); auto input_calc = make_input_calculator(iter); - auto reduce = ReduceOp( - ops, - config, - input_calc, - output_calc, - in_data, - out_data, - out_data_extra, - acc_data, - buffer.defined() ? (void*)buffer.data_ptr() : nullptr, - buffer.defined() ? (int*)semaphores.data_ptr() : nullptr, - ident, - noutputs, - base_idx); + auto reduce = + R(ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.defined() ? (void*)buffer.data_ptr() : nullptr, + buffer.defined() ? (int*)semaphores.data_ptr() : nullptr, + ident, + noutputs, + base_idx); reduce.accumulate = iter.should_accumulate(); reduce.final_output = iter.is_final_output(); diff --git a/src/ATen/native/xpu/sycl/ScanUtils.h b/src/ATen/native/xpu/sycl/ScanUtils.h index 46938522c..6bd6c4dc6 100644 --- a/src/ATen/native/xpu/sycl/ScanUtils.h +++ b/src/ATen/native/xpu/sycl/ScanUtils.h @@ -186,14 +186,13 @@ class LoopScanConfig { glb_range_y_(0), wg_range_x_(0), wg_range_y_(0) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - size_t wg_size = syclMaxWorkItemsPerEU(dev_id); + size_t wg_size = syclMaxWorkItemsPerEU(); wg_range_x_ = 32; while (problem_ <= wg_range_x_ >> 1) { wg_range_x_ = wg_range_x_ >> 1; } wg_range_y_ = wg_size / wg_range_x_; - const auto target_global_size = syclMaxWorkItemsPerTile(dev_id); + const auto target_global_size = syclMaxWorkItemsPerTile(); ; const size_t max_work_group_num = target_global_size / wg_size; const size_t wg_number = @@ -392,18 +391,6 @@ class SegmentScanConfig : public BatchKernelConfig { using OutputInfoType = OutputInfo; using IndicesInfoType = IndicesInfo; - // // Manually enable copy constructor - // SegmentScanConfig(const SegmentScanConfig& cfg_) { - // this->iinfo_ = cfg_.iinfo_; - // this->oinfo_ = cfg_.oinfo_; - // this->idxinfo_ = cfg_.idxinfo_; - // this->init_ = cfg_.init_; - // this->type_ = cfg_.type_; - // this->func_ = cfg_.func_; - // this->carrier_ = cfg_.carrier_; - // this->carrier_idx_ = cfg_.carrier_idx_; - // } - SegmentScanConfig() {} SegmentScanConfig( @@ -432,6 +419,7 @@ class SegmentScanConfig : public BatchKernelConfig { carrier_(nullptr), carrier_idx_(nullptr) {} + template static SegmentScanConfig< InputInfo, OutputInfo, @@ -450,17 +438,22 @@ class SegmentScanConfig : public BatchKernelConfig { int64_t stride = input_info.innerSize(scan_dim); int64_t problem = input_info.sizes[scan_dim]; bool problem_along_x = input_info.strides[scan_dim] == 1 ? 
true : false; - return { - input_info, - output_info, - indices_info, - batch, - problem, - stride, - problem_along_x, - init, - type, - func}; + + SegmentScanConfig + cfg = { + input_info, + output_info, + indices_info, + batch, + problem, + stride, + problem_along_x, + init, + type, + func}; + + cfg.template build(); + return cfg; } int64_t carrier_size() { @@ -706,13 +699,21 @@ static inline void _segment_scan_kernel( int dim_after_collapse, T init, BinaryFunction func) { + using SSConfig = SegmentScanConfig< + InputInfo, + OutputInfo, + OutputInfo /*not used*/, + T, + BinaryFunction>; + using KernelClass = SegmentScanKernel; + auto cfg = SegmentScanConfig< InputInfo, OutputInfo, OutputInfo /*not used*/, T, BinaryFunction>:: - make_config( + template make_config( input_info, output_info, output_info /*not used*/, diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index d86b32a8c..345d5078a 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -141,12 +141,19 @@ void CatArrayBatchedCopy( const int concatDim, IndexType dimStride, int batchCounter) { - auto& q = getCurrentSYCLQueue(); + CatArrayBatchedCopyKernelFunctor< + Tout, + underlying_out_t, + Tin, + underlying_in_t, + IndexType, + Dims> + kfn(output, inputs, os, concatDim, dimStride); // Get grid where x dim fills half gpu and y dim is number of tensors. // This will have concatenating two tensors fill the entire grid, but prevents // many threads from needlessly loading meta data if their sizes are small. - int64_t numWI = syclMaxWorkGroupSize(); + int64_t numWI = syclMaxWorkGroupSize(kfn); // We set limited numWG to prevent over-scheduling. // numWG = 512 EUs * 8 threads * SIMD lanes 32 / max_compute_units @@ -162,15 +169,7 @@ void CatArrayBatchedCopy( numWG = 128; sycl::range<2> global_range(batchCounter, numWG * numWI); sycl::range<2> local_range(1, numWI); - - CatArrayBatchedCopyKernelFunctor< - Tout, - underlying_out_t, - Tin, - underlying_in_t, - IndexType, - Dims> - kfn(output, inputs, os, concatDim, dimStride); + auto& q = getCurrentSYCLQueue(); sycl_kernel_submit(global_range, local_range, q, kfn); } diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index b8e380bc0..f0f114cde 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -118,7 +118,7 @@ static inline void softmax_group_reduce_spatial( } } -template +template static inline void get_wgroup_size( uint64_t dim_size, int outer_size, @@ -127,8 +127,7 @@ static inline void get_wgroup_size( int& global_size_row, int& local_size_row, int& local_size_col) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int maxWGSize = syclMaxWorkGroupSize(dev_id); + int maxWGSize = syclMaxWorkGroupSize(); int local_size = (dim_size + NUM * vec_size - 1) / (NUM * vec_size); local_size = std::min(local_size, maxWGSize); @@ -163,16 +162,15 @@ static inline void get_wgroup_size( } // this method helps to divide the computation resources for spatial_softmax -template +template static inline void get_wgroup_size_spatial( int bs, int dim_size, int inner_size, int& GroupSize, int& GroupRow) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int maxWGSize = syclMaxWorkGroupSize(dev_id); - int total_resource = syclMaxWorkItemsPerTile(dev_id); + int maxWGSize = syclMaxWorkGroupSize(); + int total_resource = syclMaxWorkItemsPerTile(); // set the GroupSize smaller to ensure larger group number // smaller GroupSize is friendly to the
tail case @@ -389,22 +387,11 @@ void dispatch_softmax_forward_kernel( using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); - int sub_group_num, global_size_row, local_size_row, range, local_size; - get_wgroup_size( - dim_size, - outer_size, - sub_group_num, - range, - global_size_row, - local_size_row, - local_size); - int64_t local_range{local_size_row * local_size}; - int64_t global_range{global_size_row * local_size_row * local_size}; scalar_t neginf = -std::numeric_limits::infinity(); scalar_t nan = std::numeric_limits::quiet_NaN(); if constexpr (is_masked) { - auto caller = DispatchSoftmaxForwardKernelFunctor< + using KernelClass = DispatchSoftmaxForwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -415,7 +402,21 @@ void dispatch_softmax_forward_kernel( outer_loop, is_masked, calc_t, - vec_t>( + vec_t>; + + int sub_group_num, global_size_row, local_size_row, range, local_size; + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + auto kfn = KernelClass( in_data, out_data, dim_size, @@ -429,10 +430,10 @@ void dispatch_softmax_forward_kernel( local_size, neginf, nan); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { DummyFunctor dummy; - auto caller = DispatchSoftmaxForwardKernelFunctor< + using KernelClass = DispatchSoftmaxForwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -443,7 +444,21 @@ void dispatch_softmax_forward_kernel( outer_loop, is_masked, DummyFunctor, - vec_t>( + vec_t>; + + int sub_group_num, global_size_row, local_size_row, range, local_size; + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + auto kfn = KernelClass( in_data, out_data, dim_size, @@ -457,7 +472,7 @@ void dispatch_softmax_forward_kernel( local_size, neginf, nan); - sycl_kernel_submit(global_range, local_range, queue, caller); + sycl_kernel_submit(global_range, local_range, queue, kfn); } } @@ -580,22 +595,25 @@ void softmax_forward_kernel( int outer_size) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); - auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int local_size = std::min( - (dim_size + vec_size - 1) / vec_size, int(syclMaxWorkGroupSize(dev_id))); - - int64_t local_range{local_size}; - int64_t global_range{local_size * outer_size}; - auto ker = SoftmaxForwardKernelFunctor< + using KernelClass = SoftmaxForwardKernelFunctor< vec_size, scalar_t, accscalar_t, IndexType, LogSoftMax, vec_t, - align_bytes>(in_data, out_data, dim_size, outer_size, local_size); - sycl_kernel_submit(global_range, local_range, queue, ker); + align_bytes>; + + int local_size = std::min( + (dim_size + vec_size - 1) / vec_size, + int(syclMaxWorkGroupSize())); + int64_t local_range{local_size}; + int64_t global_range{local_size * outer_size}; + + auto kfn = KernelClass(in_data, out_data, dim_size, outer_size, local_size); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -754,10 +772,16 @@ void spatial_softmax_forward( int inner_size, int outer_size) { using vec_t = 
at::native::memory::aligned_vector; - auto& queue = getCurrentSYCLQueue(); + using KernelClass = SpatialSoftmaxForwardKernelFunctor< + vec_size, + scalar_t, + accscalar_t, + IndexType, + LogSoftMax, + vec_t>; int local_size, block_row; - get_wgroup_size_spatial( + get_wgroup_size_spatial( outer_size, dim_size, inner_size, local_size, block_row); int group_num = (inner_size + local_size * vec_size - 1) / (local_size * vec_size); @@ -765,7 +789,7 @@ void spatial_softmax_forward( (size_t)outer_size, (size_t)block_row, (size_t)(group_num * local_size)}; sycl::range<3> local_range{(size_t)1, (size_t)block_row, (size_t)local_size}; - auto caller = SpatialSoftmaxForwardKernelFunctor< + auto kfn = SpatialSoftmaxForwardKernelFunctor< vec_size, scalar_t, accscalar_t, @@ -780,7 +804,9 @@ void spatial_softmax_forward( local_size, block_row, group_num); - sycl_kernel_submit(global_range, local_range, queue, caller); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -941,20 +967,10 @@ void dispatch_softmax_backward_kernel( using vec_t = at::native::memory::aligned_vector; auto& queue = getCurrentSYCLQueue(); constexpr int NUM = INNER_LOOP / vec_size * (SIMD32 / SIMD); - int sub_group_num, global_size_row, local_size_row, range, local_size; - get_wgroup_size( - dim_size, - outer_size, - sub_group_num, - range, - global_size_row, - local_size_row, - local_size); - int64_t local_range{local_size_row * local_size}; - int64_t global_range{global_size_row * local_size_row * local_size}; + int sub_group_num, global_size_row, local_size_row, range, local_size; if constexpr (is_masked) { - auto caller = DispatchSoftmaxBackwardKernelFunctor< + using KernelClass = DispatchSoftmaxBackwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -965,7 +981,18 @@ void dispatch_softmax_backward_kernel( is_masked, calc_t, vec_t, - NUM>( + NUM>; + + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + + auto kfn = KernelClass( gradInput, output, gradOutput, @@ -978,10 +1005,14 @@ void dispatch_softmax_backward_kernel( local_size_row, range, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + sycl_kernel_submit(global_range, local_range, queue, kfn); } else { DummyFunctor dummy; - auto caller = DispatchSoftmaxBackwardKernelFunctor< + using KernelClass = DispatchSoftmaxBackwardKernelFunctor< INNER_LOOP, vec_size, SIMD, @@ -992,7 +1023,18 @@ void dispatch_softmax_backward_kernel( is_masked, DummyFunctor, vec_t, - NUM>( + NUM>; + + get_wgroup_size( + dim_size, + outer_size, + sub_group_num, + range, + global_size_row, + local_size_row, + local_size); + + auto kfn = KernelClass( gradInput, output, gradOutput, @@ -1005,7 +1047,11 @@ void dispatch_softmax_backward_kernel( local_size_row, range, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + + int64_t local_range{local_size_row * local_size}; + int64_t global_range{global_size_row * local_size_row * local_size}; + + sycl_kernel_submit(global_range, local_range, queue, kfn); } } @@ -1127,24 +1173,25 @@ void softmax_backward_kernel( int outer_size) { using vec_t = at::native::memory::aligned_vector; constexpr int align_bytes = alignof(vec_t); - auto& queue = getCurrentSYCLQueue(); - - auto dev_id = getDeviceIndexOfCurrentQueue(); - int local_size = std::min( - (dim_size + 
vec_size - 1) / vec_size, int(syclMaxWorkGroupSize(dev_id))); - int64_t local_range{local_size}; - int64_t global_range{local_size * outer_size}; - - auto caller = SoftmaxBackwardKernelFunctor< + using KernelClass = SoftmaxBackwardKernelFunctor< vec_size, scalar_t, accscalar_t, LogSoftMax, vec_t, - align_bytes>( + align_bytes>; + + int local_size = std::min( + (dim_size + vec_size - 1) / vec_size, + int(syclMaxWorkGroupSize())); + int64_t local_range{local_size}; + int64_t global_range{local_size * outer_size}; + + auto kfn = KernelClass( gradInput, output, gradOutput, dim_size, outer_size, local_size); - sycl_kernel_submit(global_range, local_range, queue, caller); + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template < @@ -1271,10 +1318,15 @@ void spatial_softmax_backward_kernel( int inner_size, int outer_size) { using vec_t = at::native::memory::aligned_vector; - auto& queue = getCurrentSYCLQueue(); + using KernelClass = SpatialSoftmaxBackwardKernelFunctor< + vec_size, + scalar_t, + accscalar_t, + LogSoftMax, + vec_t>; int local_size, block_row; - get_wgroup_size_spatial( + get_wgroup_size_spatial( outer_size, dim_size, inner_size, local_size, block_row); int group_num = (inner_size + local_size * vec_size - 1) / (local_size * vec_size); @@ -1282,7 +1334,7 @@ void spatial_softmax_backward_kernel( (size_t)outer_size, (size_t)block_row, (size_t)(group_num * local_size)}; sycl::range<3> local_range{(size_t)1, (size_t)block_row, (size_t)local_size}; - auto caller = SpatialSoftmaxBackwardKernelFunctor< + auto kfn = SpatialSoftmaxBackwardKernelFunctor< vec_size, scalar_t, accscalar_t, @@ -1296,7 +1348,9 @@ void spatial_softmax_backward_kernel( outer_size, local_size, block_row); - sycl_kernel_submit(global_range, local_range, queue, caller); + + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit(global_range, local_range, queue, kfn); } template @@ -1322,8 +1376,8 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { canUse32BitIndexMath(input) && canUse32BitIndexMath(output); // decide SIMD: SIMD32 or SIMD16 - auto* dev_prop = - at::xpu::getDeviceProperties(at::xpu::getDeviceIndexOfCurrentQueue()); + auto dev_id = at::xpu::getDeviceIndexOfCurrentQueue(); + auto* dev_prop = at::xpu::getDeviceProperties(dev_id); auto sub_group_size = dev_prop->sub_group_sizes; int SIMD = sub_group_size[1]; if (SIMD == SIMD32) { @@ -1381,8 +1435,27 @@ void spatial_softmax_forward(Tensor& output, Tensor& input, int dim) { // if the element number is smaller than max_work_group_size * INNER_LOOP, // the fast path (dispatch_softmax_forward) will be selected. // otherwise, the general path (softmax_forward_kernel) will be selected. - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_group_size = syclMaxWorkGroupSize(dev_id); + + // Query the smallest max work group size among the kernel template's + // instances. The instance with the highest register pressure has the + // smallest max work group size: memory spills are likely to be most severe + // there, so the compiler may choose a narrower SIMD width to mitigate + // register pressure. The actual max work group size the compiler allows for + // these kernel templates is smaller than the device-allowed max work + // group size.
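// In code form, assuming `HeaviestInstance` is a placeholder for the
// worst-case (highest register pressure) instantiation of the functor below,
// the safe launch bound is the per-kernel query rather than the device-wide
// one:
//
//   int64_t device_max = syclDeviceMaxWorkGroupSize();             // e.g. 1024
//   int64_t kernel_max = syclMaxWorkGroupSize<HeaviestInstance>(); // may be smaller
//   // kernel_max, not device_max, bounds the work-group size used below.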
+ using DispatchSoftmaxForwardKernel = DispatchSoftmaxForwardKernelFunctor< + INNER_LOOP, + max_vec_size, + SIMD32, + scalar_t, + accscalar_t, + uint32_t, + LogSoftMax, + INNER_LOOP / max_vec_size, + false, + DummyFunctor, + vec_t>; + int max_group_size = syclMaxWorkGroupSize(); if (can_use_32bit_index && max_group_size * INNER_LOOP >= dim_size) { // it assumes vec_size * outer_loop * work_group_size >= dim_size @@ -1544,8 +1617,29 @@ void spatial_softmax_backward( outer_size); if (inner_size == 1) { - auto dev_id = getDeviceIndexOfCurrentQueue(); - int max_group_size = syclMaxWorkGroupSize(dev_id); + // Query the smallest max work group size among the kernel template's + // instances. The instance with the highest register pressure has the + // smallest max work group size: memory spills are likely to be most severe + // there, so the compiler may choose a narrower SIMD width to mitigate + // register pressure. The actual max work group size the compiler allows for + // these kernel templates is smaller than the device-allowed max work + // group size. + constexpr int NUM = INNER_LOOP / max_vec_size /* * (SIMD32 / SIMD32) */; + using DispatchSoftmaxBackwardKernel = DispatchSoftmaxBackwardKernelFunctor< + INNER_LOOP, + max_vec_size, + SIMD32, + scalar_t, + accscalar_t, + uint32_t, + LogSoftMax, + false, /* No instance for true */ + DummyFunctor, + vec_t, + NUM>; + + int max_group_size = syclMaxWorkGroupSize(); + // if the element number is smaller than max_work_group_size * INNER_LOOP // / 2, (2 indicates reading two tensors: output and gradOutput) the fast // path (dispatch_softmax_backward) will be selected. otherwise, the diff --git a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp index 99b4beeb3..ecf832de3 100644 --- a/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorTransformationsKernels.cpp @@ -50,10 +50,11 @@ struct ElementwiseKernelFunctor { template void elementwise_kernel(int total_n_elems, func_t f) { + using KernelClass = ElementwiseKernelFunctor; + auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); - int64_t max_wg_size = syclMaxWorkGroupSize(dev_id); - const auto target_global_size = syclMaxWorkItemsPerTile(dev_id); + int64_t max_wg_size = syclMaxWorkGroupSize(); + const auto target_global_size = syclMaxWorkItemsPerTile(); int work_group_size = total_n_elems > max_wg_size ?
max_wg_size : total_n_elems; const int max_work_group_num = target_global_size / work_group_size; @@ -66,8 +67,7 @@ void elementwise_kernel(int total_n_elems, func_t f) { int total_work_items = work_group_size * work_group_num; - ElementwiseKernelFunctor kfn( - loops, total_n_elems, f, total_work_items); + KernelClass kfn(loops, total_n_elems, f, total_work_items); sycl_kernel_submit( sycl::range<1>(total_work_items), @@ -205,12 +205,14 @@ void roll_template( int64_t size, int64_t stride, int64_t total_dims) { + using KernelClass = RollKernelFunctor; + auto shift = size - start; auto offset = shift * stride; auto start_offset = start * stride; auto total_offset = size * stride; - auto local_range = syclMaxWorkGroupSize(); + auto local_range = syclMaxWorkGroupSize(); const auto target_global_range = syclMaxWorkItemsPerTile() / local_range * local_range; int global_range = (N + local_range - 1) / local_range * local_range; @@ -221,7 +223,7 @@ void roll_template( auto in_data = in_tensor.data_ptr(); auto out_data = out_tensor.data_ptr(); - RollKernelFunctor kfn( + KernelClass kfn( in_data, out_data, val_of_work_item, diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp index fb63f19b6..07fd0be0a 100644 --- a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp @@ -72,12 +72,7 @@ struct ApplyTriuTrilKernelFunctor { template void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { - auto& queue = getCurrentSYCLQueue(); - auto dev_id = getDeviceIndexOfCurrentQueue(); auto N = self.numel(); - int64_t group_size = syclMaxWorkGroupSize(dev_id); - auto num_groups = ceil_div(N, group_size); - auto total_items = num_groups * group_size; IndexType self_size_0 = (IndexType)self.size(-2); IndexType self_size_1 = (IndexType)self.size(-1); IndexType self_stride = (IndexType)(self.dim() > 2 ? 
self.stride(-3) : 1); @@ -105,6 +100,11 @@ void apply_triu_tril(Tensor& result, const Tensor& self, const int64_t k) { result_ptr, self_ptr); + int64_t group_size = syclMaxWorkGroupSize(kfn); + auto num_groups = ceil_div(N, group_size); + auto total_items = num_groups * group_size; + auto& queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(total_items), sycl::range<1>(group_size), queue, kfn); } @@ -157,4 +157,4 @@ Tensor& triu_kernel(Tensor& result, const Tensor& self, int64_t k) { return result; } -} // namespace at::native::xpu \ No newline at end of file +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp index 504f28d7b..275c4f2dc 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.cpp @@ -125,12 +125,13 @@ static void upsample_bicubic2d_out_template( bool align_corners, const accscalar_t height_scale, const accscalar_t width_scale) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - int64_t num_wg = at::ceil_div(onum, wg_size); - UpsampleBicubic2dKernelFunctor kfn( odata, idata, onum, align_corners, height_scale, width_scale); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int64_t num_wg = at::ceil_div(onum, wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit(num_wg * wg_size, wg_size, queue, kfn); } diff --git a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp index 1ab02435a..5ad95e0c2 100644 --- a/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp +++ b/src/ATen/native/xpu/sycl/UpSampleBilinear2dKernels.cpp @@ -107,10 +107,6 @@ void launch_upsample_bilinear2d_kernel( int64_t output_width, int64_t nbatch, int64_t channels) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - int num_group = at::ceil_div(n, (int)wg_size); - UpsampleBilinear2dKernelFunctor kfn( n, rheight, @@ -125,6 +121,10 @@ void launch_upsample_bilinear2d_kernel( nbatch, channels); + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int num_group = at::ceil_div(n, (int)wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); } @@ -249,14 +249,10 @@ void launch_upsample_bilinear2d_backward_kernel( const bool align_corners, scalar_t* idata, const scalar_t* odata) { - auto queue = getCurrentSYCLQueue(); - int64_t wg_size = syclMaxWorkGroupSize(); - const size_t o_numel = nc * output_width * output_height; const size_t i_numel = nc * input_width * input_height; const size_t num_kernels = nc * output_width * output_height; - int num_group = at::ceil_div((int64_t)num_kernels, (int64_t)wg_size); UpsampleBilinear2dBackwardKernelFunctor kfn( nc, @@ -273,6 +269,11 @@ void launch_upsample_bilinear2d_backward_kernel( odata, o_numel, i_numel); + + int64_t wg_size = syclMaxWorkGroupSize(kfn); + int num_group = at::ceil_div((int64_t)num_kernels, (int64_t)wg_size); + auto queue = getCurrentSYCLQueue(); + sycl_kernel_submit( sycl::range<1>(num_group * wg_size), sycl::range<1>(wg_size), queue, kfn); } diff --git a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h index 68f42ec88..96a2452b2 100644 --- a/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h +++ b/src/ATen/native/xpu/sycl/pstl/PSTLFunctions.h @@ -174,31 +174,35 @@ static inline OutputIt _scan_kernel( InputIt 
last, OutputIt d_first, T init) { + using KSScanKernel = KSScanKernelFunctor; + using KSScanWithCarrierKernel = + KSScanWithCarrierKernelFunctor; + const auto N = std::distance(first, last); auto& q = getCurrentSYCLQueue(); - const auto wgroup_size = syclMaxWorkGroupSize(); - const auto ngroups = (N + wgroup_size - 1) / wgroup_size; + const auto kss_wgroup_size = syclMaxWorkGroupSize(); auto options = map_options(); - if (N <= wgroup_size) { + if (N <= kss_wgroup_size) { // Kogge-Stone addr algorithm; - KSScanKernelFunctor kfn1( - first, init, N, d_first); + KSScanKernel kfn1(first, init, N, d_first); sycl_kernel_submit(sycl::range<1>(N), sycl::range<1>(N), q, kfn1); return d_first + N; } + const auto kssc_wgroup_size = syclMaxWorkGroupSize(); + auto ngroups = (N + kssc_wgroup_size - 1) / kssc_wgroup_size; Tensor carry = at::empty({ngroups}, options); T* carry_ptr = carry.data_ptr(); // 1. do exclusive_scan on each workgroups - KSScanWithCarrierKernelFunctor kfn2( - first, init, N, carry_ptr, wgroup_size, d_first); + KSScanWithCarrierKernel kfn2( + first, init, N, carry_ptr, kssc_wgroup_size, d_first); sycl_kernel_submit( - sycl::range<1>(ngroups * wgroup_size), - sycl::range<1>(wgroup_size), + sycl::range<1>(ngroups * kssc_wgroup_size), + sycl::range<1>(kssc_wgroup_size), q, kfn2); @@ -207,9 +211,13 @@ static inline OutputIt _scan_kernel( // 3. reduce among all work groups and flush data to dst ScanAccumulateKernelFunctor kfn3(d_first, carry_ptr, N); + + const auto sa_wgroup_size = syclMaxWorkGroupSize(kfn3); + ngroups = (N + sa_wgroup_size - 1) / sa_wgroup_size; + sycl_kernel_submit( - sycl::range<1>(ngroups * wgroup_size), - sycl::range<1>(wgroup_size), + sycl::range<1>(ngroups * sa_wgroup_size), + sycl::range<1>(sa_wgroup_size), q, kfn3); diff --git a/src/comm/DeviceProperties.h b/src/comm/DeviceProperties.h index 5597248fa..0f4c084c8 100644 --- a/src/comm/DeviceProperties.h +++ b/src/comm/DeviceProperties.h @@ -3,11 +3,34 @@ #include #include +#include namespace xpu { namespace sycl { -static inline int64_t syclMaxWorkGroupSize( +template +static int64_t syclMaxWorkGroupSize( + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + auto q = c10::xpu::getCurrentXPUStream(dev_id).queue(); + auto ctx = q.get_context(); + auto dev = q.get_device(); + + auto kid = ::sycl::get_kernel_id(); + auto kbundle = + ::sycl::get_kernel_bundle<::sycl::bundle_state::executable>(ctx, {kid}); + + ::sycl::kernel k = kbundle.get_kernel(kid); + return k.get_info<::sycl::info::kernel_device_specific::work_group_size>(dev); +} + +template +static int64_t syclMaxWorkGroupSize( + KernelClass /*kfn*/, + at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { + return syclMaxWorkGroupSize(dev_id); +} + +static inline int64_t syclDeviceMaxWorkGroupSize( at::DeviceIndex dev_id = at::xpu::getDeviceIndexOfCurrentQueue()) { auto* dev_prop = at::xpu::getDeviceProperties(dev_id); return dev_prop->max_work_group_size;
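// A minimal call-site sketch of the per-kernel query above, mirroring the
// launch pattern used throughout this patch. `SomeKernelFunctor` and `N` are
// placeholders (a functor type known to SYCL and the number of work items):
//
//   SomeKernelFunctor kfn(/* captured arguments */);
//   int64_t wg_size = syclMaxWorkGroupSize(kfn); // kernel-specific limit
//   int64_t num_wg = (N + wg_size - 1) / wg_size;
//   sycl_kernel_submit(num_wg * wg_size, wg_size, getCurrentSYCLQueue(), kfn);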