diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 4a27034dc..f79442bc2 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -530,6 +530,71 @@ Tensor& XPUNativeFunctions::sigmoid_out(const Tensor& self, Tensor& out) { return out; } +Tensor XPUNativeFunctions::sign(const Tensor& self) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_op(out, self); + native::xpu::sign_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::sign_(Tensor& self) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + TensorIterator iter; + iter.build_borrowing_unary_op(self, self); + native::xpu::sign_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::sign_out(const Tensor& self, Tensor& out) { + TORCH_CHECK( + !self.is_complex(), + "Unlike NumPy, torch.sign is not intended to support complex numbers. Please use torch.sgn instead."); + TensorIterator iter; + iter.build_borrowing_unary_op(out, self); + native::xpu::sign_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::signbit(const Tensor& self) { + TORCH_CHECK( + !self.is_complex(), "signbit is not implemented for complex tensors."); + + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_force_boolean_op(out, self); + + if (self.dtype() == at::kBool) { + iter.output().fill_(false); + } else { + native::xpu::signbit_kernel(iter); + } + return iter.output(); +} + +Tensor& XPUNativeFunctions::signbit_out(const Tensor& self, Tensor& out) { + TORCH_CHECK( + !self.is_complex(), "signbit is not implemented for complex tensors."); + TORCH_CHECK( + out.dtype() == at::kBool, + "signbit does not support non-boolean outputs."); + + TensorIterator iter; + iter.build_borrowing_unary_force_boolean_op(out, self); + + if (self.dtype() == at::kBool) { + out.fill_(false); + } else { + native::xpu::signbit_kernel(iter); + } + return out; +} + Tensor& XPUNativeFunctions::logit_out( const Tensor& self, std::optional eps, diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3445f5e42..5b2d6e5ff 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -245,8 +245,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "_scaled_mm", "segment_reduce", "_segment_reduce_backward", - "signbit.out", - "sign.out", "sinc.out", "special_airy_ai.out", "special_bessel_j0.out", diff --git a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp index 10f7f0eec..25acd873a 100644 --- a/src/ATen/native/xpu/sycl/BatchNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/BatchNormKernels.cpp @@ -263,7 +263,8 @@ static inline void group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); } if (sub_group_num == 1) { if (lane_id == 0) { @@ -294,7 +295,8 @@ static inline void group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } @@ -450,10 +452,10 @@ struct 
BatchNormCollectStatisticsKernelFunctor // one value per subgroup #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - stat_accscalar_t o_avg = sg.shuffle_xor(avg, i); - int o_n = sg.shuffle_xor(n, i); + stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i); + int o_n = sycl::permute_group_by_xor(sg, n, i); stat_accscalar_t factor = 1.0 / fmaxf(1.0, n + o_n); - var_n += sg.shuffle_xor(var_n, i) + + var_n += sycl::permute_group_by_xor(sg, var_n, i) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; avg = (n * avg + o_n * o_avg) * factor; n += o_n; @@ -481,10 +483,10 @@ struct BatchNormCollectStatisticsKernelFunctor } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - stat_accscalar_t o_avg = sg.shuffle_xor(avg, i); - int o_n = sg.shuffle_xor(n, i); + stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i); + int o_n = sycl::permute_group_by_xor(sg, n, i); stat_accscalar_t factor = 1.0f / fmaxf(1.0f, n + o_n); - var_n += sg.shuffle_xor(var_n, i) + + var_n += sycl::permute_group_by_xor(sg, var_n, i) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; avg = (n * avg + o_n * o_avg) * factor; n += o_n; diff --git a/src/ATen/native/xpu/sycl/DistanceKernels.cpp b/src/ATen/native/xpu/sycl/DistanceKernels.cpp index 8bd61bdd3..eb0f1f50e 100644 --- a/src/ATen/native/xpu/sycl/DistanceKernels.cpp +++ b/src/ATen/native/xpu/sycl/DistanceKernels.cpp @@ -120,7 +120,7 @@ scalar_t subgroup_reduce_agg_without_broadcast_impl( #pragma unroll for (int offset = (SG_SIZE >> 1); offset > 0; offset >>= 1) { - F::agg(value, sg.shuffle_down(value, offset)); + F::agg(value, sycl::shift_group_left(sg, value, offset)); } return value; } diff --git a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp index 622e99ffe..8dafdc8d2 100644 --- a/src/ATen/native/xpu/sycl/GroupNormKernels.cpp +++ b/src/ATen/native/xpu/sycl/GroupNormKernels.cpp @@ -3,10 +3,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -18,23 +18,23 @@ template < typename index_t, typename res_t> struct WelfordOpsXPU - : public at::native::WelfordOps { + : public WelfordOps { sycl::nd_item<1>& item; public: - using acc_t = typename at::native:: - WelfordOps::acc_t; + using acc_t = + typename WelfordOps::acc_t; inline acc_t shfl_down(acc_t acc, int offset) const { auto sg = item.get_sub_group(); return { - sg.shuffle_down(acc.mean, offset), - sg.shuffle_down(acc.m2, offset), - sg.shuffle_down(acc.n, offset), - sg.shuffle_down(acc.nf, offset)}; + sycl::shift_group_left(sg, acc.mean, offset), + sycl::shift_group_left(sg, acc.m2, offset), + sycl::shift_group_left(sg, acc.n, offset), + sycl::shift_group_left(sg, acc.nf, offset)}; } WelfordOpsXPU(acc_scalar_t correction, bool take_sqrt, sycl::nd_item<1>& item) - : at::native::WelfordOps( + : WelfordOps( correction, take_sqrt), item(item) {} @@ -43,7 +43,7 @@ struct WelfordOpsXPU template struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ { using T_ACC = acc_type_device; - using WelfordType = at::native::WelfordData; + using WelfordType = WelfordData; using WelfordOp = WelfordOpsXPU>; diff --git a/src/ATen/native/xpu/sycl/Norm.h b/src/ATen/native/xpu/sycl/Norm.h index 9aee941cb..36d1282a3 100644 --- a/src/ATen/native/xpu/sycl/Norm.h +++ b/src/ATen/native/xpu/sycl/Norm.h @@ -39,8 +39,10 @@ static inline void norm_group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - sum1 = bin_op(sum1, static_cast(sg.shuffle_down(sum1, i))); - 
sum2 = bin_op(sum2, static_cast(sg.shuffle_down(sum2, i))); + sum1 = bin_op( + sum1, static_cast(sycl::shift_group_left(sg, sum1, i))); + sum2 = bin_op( + sum2, static_cast(sycl::shift_group_left(sg, sum2, i))); } if (sub_group_num == 1) { sum1 = sycl::group_broadcast(sg, sum1, 0); @@ -73,8 +75,10 @@ static inline void norm_group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - sum1 = bin_op(sum1, static_cast(sg.shuffle_down(sum1, i))); - sum2 = bin_op(sum2, static_cast(sg.shuffle_down(sum2, i))); + sum1 = bin_op( + sum1, static_cast(sycl::shift_group_left(sg, sum1, i))); + sum2 = bin_op( + sum2, static_cast(sycl::shift_group_left(sg, sum2, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } diff --git a/src/ATen/native/xpu/sycl/Reduce.h b/src/ATen/native/xpu/sycl/Reduce.h index 1be3a5e93..276b21175 100644 --- a/src/ATen/native/xpu/sycl/Reduce.h +++ b/src/ATen/native/xpu/sycl/Reduce.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ inline at::detail::Array group_reduce( for (int offset = 1; offset < sg_size; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -71,7 +72,7 @@ inline at::detail::Array group_reduce( for (int offset = 1; offset < sg_range; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -132,7 +133,7 @@ inline at::detail::Array group_x_reduce( for (int offset = 1; offset < dim_x; offset <<= 1) { #pragma unroll(out_vec_sz) for (int i = 0; i < out_vec_sz; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = combine(value[i], other); } } @@ -541,11 +542,11 @@ struct ReduceOp { (const scalar_t*)((const char*)src + base_offsets1); value = item_reduce(pos, input_slice); } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair for half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the + // arg_t is a pair for half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. constexpr bool is_pair = - std::is_same, arg_t>::value; + std::is_same, arg_t>::value; auto combine = [=](arg1_t value, arg2_t other) -> arg1_t { return ops.combine(value, other); @@ -832,8 +833,8 @@ struct ReduceOp { return value_list[0]; } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair with half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t + // is a pair with half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. 
template at::detail::Array group_reduce_for_compound_dtype( @@ -850,7 +851,7 @@ struct ReduceOp { for (int offset = 1; offset < (int)sbgrpSize; offset <<= 1) { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { - arg_t other = sg.shuffle_down(value[i], offset); + arg_t other = sycl::shift_group_left(sg, value[i], offset); value[i] = ops.combine(value[i], other); } } @@ -875,12 +876,13 @@ struct ReduceOp { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { // Shuffle down separately for first and second pair. - std::pair - other = std::pair< - typename arg_t::first_type, - typename arg_t::second_type>( - sg.shuffle_down(value[i].first, offset), - sg.shuffle_down(value[i].second, offset)); + at::xpu:: + pair + other = at::xpu::pair< + typename arg_t::first_type, + typename arg_t::second_type>( + sycl::shift_group_left(sg, value[i].first, offset), + sycl::shift_group_left(sg, value[i].second, offset)); value[i] = ops.combine(value[i], other); } } @@ -907,8 +909,8 @@ struct ReduceOp { return value; } - // TODO: Currently, there are bugs with shuffle_down when the arg_t is a - // pair for half dtype, We temporarily workaround to do + // TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t + // is a pair for half dtype, We temporarily workaround to do // "reduce_for_compound_dtype" function. template at::detail::Array group_x_reduce_for_compound_dtype( @@ -947,11 +949,11 @@ struct ReduceOp { for (int offset = 1; offset < dim_x; offset <<= 1) { #pragma unroll(output_vec_size) for (int i = 0; i < output_vec_size; ++i) { - std::pair - other = std:: + at::xpu::pair + other = xpu:: pair( - sg.shuffle_down(value[i].first, offset), - sg.shuffle_down(value[i].second, offset)); + sycl::shift_group_left(sg, value[i].first, offset), + sycl::shift_group_left(sg, value[i].second, offset)); value[i] = ops.combine(value[i], other); } } @@ -1028,7 +1030,8 @@ struct ReduceOp { // Currently implemented for max of two outputs template - void set_results(const std::pair x, const index_t base_offset) const { + void set_results(const at::xpu::pair x, const index_t base_offset) + const { if (noutputs >= 1) { auto res0 = (T1*)((char*)dst[0] + base_offset); *res0 = x.first; @@ -1121,9 +1124,10 @@ struct ReduceOp { decltype(combine), output_vec_size>(pos, shared_memory, value, combine); if (config.should_group_x_reduce()) { - // TODO: workaround because sg.shuffle_down will fail on `half` dtype. + // TODO: workaround because sycl::shift_group_left will fail on `half` + // dtype. 
constexpr bool is_pair = - std::is_same, arg_t>::value; + std::is_same, arg_t>::value; if constexpr (is_pair) { value = group_x_reduce_for_compound_dtype( pos, value, shared_memory); diff --git a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp index 04c847a4d..121761053 100644 --- a/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at::native::xpu { @@ -12,7 +12,7 @@ void _min_max_values_kernel_xpu_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, MinMaxOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::upper_bound(), at::numeric_limits::lower_bound())); } diff --git a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp index c41ad0c04..1e18b2e5b 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at { namespace native { @@ -14,7 +14,8 @@ void argmax_kernel_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, ArgMaxOps{}, - std::pair(at::numeric_limits::lower_bound(), 0)); + at::xpu::pair( + at::numeric_limits::lower_bound(), 0)); }; void argmax_kernel(TensorIterator& iter) { diff --git a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp index 2f6c38152..3c9f8453d 100644 --- a/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at::native::xpu { @@ -12,7 +12,8 @@ void argmin_kernel_impl(TensorIterator& iter) { gpu_reduce_kernel( iter, ArgMinOps{}, - std::pair(at::numeric_limits::upper_bound(), 0)); + at::xpu::pair( + at::numeric_limits::upper_bound(), 0)); }; void argmin_kernel(TensorIterator& iter) { diff --git a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp index 0dfa7fd52..16095056c 100644 --- a/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp @@ -1,7 +1,7 @@ #include #include -#include #include +#include #include #include @@ -38,7 +38,7 @@ void max_kernel(TensorIterator& iter) { gpu_reduce_kernel( iter, MaxOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::lower_bound(), 0)); }); } diff --git a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp index fff7207c6..2a0ce889c 100644 --- a/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include #include #include @@ -37,7 +37,7 @@ void min_kernel(TensorIterator& iter) { gpu_reduce_kernel( iter, MinOps{}, - std::pair( + at::xpu::pair( at::numeric_limits::upper_bound(), 0)); }); } diff --git a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp index 54ae01273..6d0e75680 100644 --- a/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp +++ b/src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp @@ -1,9 +1,9 @@ #include #include -#include #include #include #include +#include namespace at { namespace native { @@ -18,7 +18,7 @@ void std_var_template( // This is necessary to lower register usage that leads to register spills. 
using accscalar_t = at::acc_type_device; using ops_t = - WelfordOps>; + WelfordOps>; ops_t ops(static_cast(correction_opt), take_sqrt); gpu_reduce_kernel(iter, ops, typename ops_t::acc_t{}); } diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp index 4aac0cceb..658f2e21b 100644 --- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp +++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h b/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h index 35f1d54a5..8729f31dd 100644 --- a/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h +++ b/src/ATen/native/xpu/sycl/SYCLGroupAlgorithm.h @@ -6,7 +6,7 @@ template inline T GroupReduceSumSGSizeEqualstoNumSG(sg_t& sg, T val) { auto sg_size = sg.get_local_range()[0]; for (int offset = (sg_size >> 1); offset > 0; offset >>= 1) { - val += sg.shuffle_down(val, offset); + val += sycl::shift_group_left(sg, val, offset); } return val; } diff --git a/src/ATen/native/xpu/sycl/SharedReduceOps.h b/src/ATen/native/xpu/sycl/SharedReduceOps.h new file mode 100644 index 000000000..0e63cc6ed --- /dev/null +++ b/src/ATen/native/xpu/sycl/SharedReduceOps.h @@ -0,0 +1,416 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(X, Y) max_impl(X, Y) +#define MIN(X, Y) min_impl(X, Y) + +#define device_sqrt std::sqrt +#define compat_pow std::pow + +namespace at { +namespace native { +namespace xpu { + +template +struct WelfordData { + scalar_t mean; + scalar_t m2; + index_t n; + scalar_t nf; + + WelfordData() : mean(0), m2(0), n(0), nf(0) {} + + WelfordData(scalar_t mean, scalar_t m2, index_t n, scalar_t nf) + : mean(mean), m2(m2), n(n), nf(nf) {} +}; + +template < + typename scalar_t, + typename acc_scalar_t, + typename index_t, + typename res_t> +struct WelfordOps { + acc_scalar_t correction; + bool take_sqrt; + + public: + using acc_t = WelfordData; + inline acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + // We accumulate n in index_t to avoid cumulative rounding error, but still + // need nf for use in combine where int32 may overflow. + index_t new_n = acc.n + 1; + acc_scalar_t new_nf = static_cast(new_n); + acc_scalar_t delta = data - acc.mean; + acc_scalar_t new_mean = acc.mean + delta / new_nf; + acc_scalar_t new_delta = data - new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + new_n, + new_nf, + }; + } + inline acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + acc_scalar_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the + // count correctly within its range, setting it to -1 to avoid confusion + -1, + new_count}; + } + inline res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const auto divisor = acc.nf > correction ? acc.nf - correction : 0; + const auto var = acc.m2 / divisor; + res_t results(take_sqrt ? 
device_sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +template < + typename scalar_t, + typename acc_t = scalar_t, + typename factor_t = acc_t, + typename out_t = acc_t> +struct MeanOps { + factor_t factor; + + inline acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const { + return combine(a, static_cast(b)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a * factor; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + MeanOps(factor_t factor) : factor(factor) {} +}; + +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct AbsMinOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(data))); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return MIN(a, b); + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct AbsMaxOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(data))); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return MAX(a, b); + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormOps { + acc_t norm_; + + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(data)), norm_); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return compat_pow(a, static_cast(1.0) / norm_); + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + NormOps(acc_t norm_) : norm_(norm_) {} +}; + +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormZeroOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + + (data == static_cast(0) ? 
static_cast(0) + : static_cast(1)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormOneOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(data)); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return a; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +template +struct AbsSwitch {}; + +template +inline acc_t abs_if_complex(scalar_t data, AbsSwitch) { + return static_cast(data); +} + +template +inline acc_t abs_if_complex(std::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +template +inline acc_t abs_if_complex(c10::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the +// accumulated value. These types differ for complex number input support. +template +struct NormTwoOps { + inline acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline out_t project(acc_t a) const { + return device_sqrt(a); + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +template +struct NanSumOps { + inline acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const { + return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b}); + } + + inline acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline data_t project(acc_t a) const { + return data_t{a}; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +namespace detail { + +template +struct LessOrNan { + bool operator()(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a < b); + } +}; + +template +struct GreaterOrNan { + bool operator()(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); + } +}; + +template +struct MinMaxReductionOps { + using scalar_t = typename binary_function_traits::arg1_t; + using index_t = int64_t; + using arg_t = at::xpu::pair; + + static arg_t project(arg_t arg) { + return arg; + } + + static arg_t reduce(arg_t arg, scalar_t val, int64_t idx) { + return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx); + } + + static arg_t combine(arg_t a, arg_t b) { + return comp_t{}(a.first, b.first, a.second, b.second) ? 
a : b; + } + + static arg_t translate_idx(arg_t a, int64_t base_idx) { + return {a.first, a.second + base_idx}; + } +}; + +template +struct ArgReductionOps : public MinMaxReductionOps { + using typename MinMaxReductionOps::scalar_t; + using typename MinMaxReductionOps::index_t; + using typename MinMaxReductionOps::arg_t; + + static index_t project(arg_t arg) { + return arg.second; + } +}; + +} // namespace detail + +template +struct ArgMaxOps + : public detail::ArgReductionOps> {}; + +template +struct ArgMinOps : public detail::ArgReductionOps> { +}; + +template +struct MinOps : public detail::MinMaxReductionOps> { +}; + +template +struct MaxOps + : public detail::MinMaxReductionOps> {}; + +template +struct MinMaxOps { + using acc_t = at::xpu::pair; + inline acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + return combine(acc, {data, data}); + } + + inline acc_t combine(acc_t a, acc_t b) const { + auto min_val = + (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first; + auto max_val = + (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second; + + return {min_val, max_val}; + } + + inline acc_t project(acc_t acc) const { + return acc; + } + + static acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } +}; + +} // namespace xpu +} // namespace native +} // namespace at + +#undef MAX +#undef MIN diff --git a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp index 8db72165a..78062e21a 100644 --- a/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp +++ b/src/ATen/native/xpu/sycl/SoftMaxKernels.cpp @@ -40,7 +40,8 @@ static inline void softmax_group_reduce( // uint32_t SIMD = sg.get_local_range()[0]; #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); } if (sub_group_num == 1) { val = sycl::group_broadcast(sg, val, 0); @@ -68,7 +69,8 @@ static inline void softmax_group_reduce( } #pragma unroll for (int i = 1; i < SIMD; i <<= 1) { - val = bin_op(val, static_cast(sg.shuffle_down(val, i))); + val = bin_op( + val, static_cast(sycl::shift_group_left(sg, val, i))); if (i >= ((sub_group_num + 1) >> 1)) break; } diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp index fee36b973..b35aba437 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.cpp @@ -54,6 +54,35 @@ void sign_kernel(TensorIteratorBase& iter) { } } +template +struct SignbitIntFunctor { + bool operator()(scalar_t a) const { + return is_negative(a); + } +}; + +template +struct SignbitFunctor { + bool operator()(scalar_t a) const { + using opmath_t = at::opmath_type; + return std::signbit(opmath_t{a}); + } +}; + +void signbit_kernel(TensorIteratorBase& iter) { + // NOTE: signbit does not always support integral arguments. 
+ if (at::isIntegralType(iter.input_dtype(), /*includeBool=*/false)) { + AT_DISPATCH_INTEGRAL_TYPES(iter.input_dtype(), "signbit_xpu", [&]() { + gpu_kernel(iter, SignbitIntFunctor()); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_xpu", [&]() { + gpu_kernel(iter, SignbitFunctor()); + }); + } +} + template struct LogicalNotFunctor { scalar_t operator()(scalar_t a) const { diff --git a/src/ATen/native/xpu/sycl/UnarySignKernels.h b/src/ATen/native/xpu/sycl/UnarySignKernels.h index 1fefcf035..98b6eeb5f 100644 --- a/src/ATen/native/xpu/sycl/UnarySignKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySignKernels.h @@ -12,4 +12,6 @@ void sgn_kernel(TensorIteratorBase& iter); void sign_kernel(TensorIteratorBase& iter); +void signbit_kernel(TensorIteratorBase& iter); + } // namespace at::native::xpu diff --git a/src/comm/XPUPair.h b/src/comm/XPUPair.h new file mode 100644 index 000000000..0c36f8625 --- /dev/null +++ b/src/comm/XPUPair.h @@ -0,0 +1,78 @@ +#pragma once + +#include + +namespace at::xpu { + +template +struct pair { + typedef T1 first_type; + typedef T2 second_type; + + first_type first; + second_type second; + + // default constructor + pair(void) : first(), second() {} + + inline pair(const T1& x, const T2& y) : first(x), second(y) {} + + template + inline pair(const pair& p) : first(p.first), second(p.second) {} + + template + pair(const std::pair& p) : first(p.first), second(p.second) {} +}; + +template +bool operator==(const pair& x, const pair& y) { + return x.first == y.first && x.second == y.second; +} + +template +inline bool operator<(const pair& x, const pair& y) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} + +template +inline bool operator!=(const pair& x, const pair& y) { + return !(x == y); +} + +template +inline bool operator>(const pair& x, const pair& y) { + return y < x; +} + +template +bool operator<=(const pair& x, const pair& y) { + return !(y < x); +} + +template +bool operator>=(const pair& x, const pair& y) { + return !(x < y); +} + +template +inline pair make_pair(T1 x, T2 y) { + return pair(x, y); +} + +template +inline auto& get(pair& p) { + if constexpr (N == 0) + return p.first; + else + return p.second; +} + +template +inline const auto& get(const pair& p) { + if constexpr (N == 0) + return p.first; + else + return p.second; +} + +} // namespace at::xpu diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index ca698993e..f94a9cd5d 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1264,7 +1264,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_RReLU_with_up_down_cuda", # AssertionError: Scalars are not close! "test_RReLU_with_up_down_scalar_cuda", - # lstm: AssertionError: Scalars are not equal! + # rnn fallback to cpu "test_cudnn_weight_format", # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). "test_EmbeddingBag_sparse_cuda", @@ -1289,16 +1289,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_rnn_retain_variables_xpu_float64", "test_transformerencoderlayer_xpu_float64", "test_variable_sequence_xpu_float64", - # CPU fallback fails - # AssertionError: Tensor-likes are not close! 
+ # native_group_norm : RuntimeError: Expected X.is_contiguous(memory_format) to be true, but got false. (Could this error message be improved? If so, please report an enhancement request to PyTorch.) "test_GroupNorm_memory_format_xpu", # AssertionError: Scalars are not close! "test_InstanceNorm1d_general_xpu", "test_InstanceNorm2d_general_xpu", "test_InstanceNorm3d_general_xpu", - # AssertionError: False is not true - "test_device_mask_xpu", - "test_overwrite_module_params_on_conversion_cpu_device_xpu", # AssertionError: RuntimeError not raised "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bicubic_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bilinear_uint8_xpu_uint8", @@ -1308,8 +1304,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bilinear_uint8_xpu_uint8", - "test_grid_sample_error_checking", - # Failed: Unexpected success + #upsamplingNearest2d: Failed: Unexpected success "test_upsamplingNearest2d_launch_fail_xpu", # CPU fallback could not cover # NotImplementedError: Could not run 'aten::_thnn_fused_gru_cell' with arguments from the 'CPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build pro... @@ -1329,9 +1324,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half", "test_MultiLabelMarginLoss_no_batch_dim_sum_cuda_half", - # align CUDA to skip, XPU implementation is not yet supporting uint8 - "test_upsamplingBiMode2d_consistency", - "test_upsamplingBiLinear2d_consistency_interp_size_bug", ) res += launch_test("test_nn_xpu.py", skip_list) @@ -2217,6 +2209,12 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_scaled_mm_vs_emulated_float16_xpu", "test_scaled_mm_vs_emulated_float32_xpu", "test_scaled_mm_vs_emulated_row_wise_bfloat16_xpu", + + # https://github.com/intel/torch-xpu-ops/issues/676 + # Mismatched elements: 9 / 1003002 (0.0%) + # Greatest absolute difference: 711.126220703125 at index (472, 999) (up to 0.1 allowed) + # Greatest relative difference: 2.7107455730438232 at index (472, 997) (up to 0.1 allowed) + "test_cublas_addmm_size_1000_xpu_float32", ) res += launch_test("test_matmul_cuda_xpu.py", skip_list=skip_list) diff --git a/test/xpu/test_nn_xpu.py b/test/xpu/test_nn_xpu.py index 02cd90e7c..fb05f2c78 100644 --- a/test/xpu/test_nn_xpu.py +++ b/test/xpu/test_nn_xpu.py @@ -12,6 +12,7 @@ import torch import torch.nn.functional as F from torch import nn +import torch.nn.utils.rnn as rnn_utils from torch.testing._internal.common_device_type import ( dtypes, instantiate_device_type_tests, @@ -1686,6 +1687,170 @@ def _batch_norm_stats(data, memory_format, mean_axes): _batch_norm_stats(torch.randn(1, 96, 112, 112, 112, dtype=torch.float, device='xpu'), torch.channels_last_3d, (0, 2, 3, 4)) TestNN.test_sync_batchnorm_accuracy_cuda=_test_sync_batchnorm_accuracy_xpu +@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) +@parametrize_test("mode", ["bilinear", "bicubic"]) +@parametrize_test("antialias", [True, False]) +@parametrize_test("align_corners", 
[True, False]) +@parametrize_test("num_channels", [3, 5]) +@parametrize_test("output_size", [32, 600]) +@parametrize_test("check_as_unsqueezed_3d_tensor", [True, False]) +@parametrize_test("non_contig", [False, "sliced", "restrided"]) +@parametrize_test("batch_size", [1, 5]) +def _test_upsamplingBiMode2d_consistency( + self, + device, + memory_format, + mode, + antialias, + align_corners, + num_channels, + output_size, + check_as_unsqueezed_3d_tensor, + non_contig, + batch_size, +): + # Check output value consistency between resized_input_uint8 and resized input_float + if torch.device(device).type == "xpu": + raise SkipTest("XPU implementation is not yet supporting uint8") + + torch.manual_seed(0) + + # - input range is set to [30, 220] for bicubic mode, because the bicubic kernel may create + # [intermediate] values outside of the [0, 255] range, which need + # to be clipped in uint8 path, but not in float path. This isn't + # an issue with bilinear kernel. + input_range = (30, 220) if mode == "bicubic" else (0, 256) + input_ui8 = torch.randint(*input_range, size=(batch_size, num_channels, 400, 400), dtype=torch.uint8, device=device) + input_ui8 = input_ui8.contiguous(memory_format=memory_format) + + if non_contig == "sliced": + input_ui8 = input_ui8[:, :, 10:-10, 10:-10] + elif non_contig == "restrided": + input_ui8 = input_ui8[:, :, ::2, ::2] + + if batch_size == 1 and check_as_unsqueezed_3d_tensor: + input_ui8 = input_ui8[0, ...] + input_ui8 = input_ui8[None, ...] + + input_f32 = input_ui8.float() + + output_f32 = F.interpolate( + input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ).round().clip(0, 255) + output_ui8 = F.interpolate( + input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias + ) + + if non_contig is False: + self.assertTrue(input_ui8.is_contiguous(memory_format=memory_format)) + + # FIXME if-clause shows the current behaviour which is definitely unexpected. 
+ # Ideally we want to fix it such that both the ui8 and f32 outputs are also channels_last + # See for more details: https://github.com/pytorch/pytorch/pull/100373 + if batch_size == 1 and check_as_unsqueezed_3d_tensor and memory_format == torch.channels_last: + self.assertTrue(output_ui8.is_contiguous()) + self.assertTrue(output_f32.is_contiguous()) + else: + self.assertTrue(output_ui8.is_contiguous(memory_format=memory_format)) + self.assertTrue(output_f32.is_contiguous(memory_format=memory_format)) + + if mode == "bilinear": + torch.testing.assert_close(output_f32, output_ui8.float(), rtol=0, atol=1) + else: + diff = (output_f32 - output_ui8.float()).abs() + self.assertLess(diff.max(), 15) + + threshold = 2 + percent = 3 + self.assertLess((diff > threshold).float().mean(), percent / 100) + + threshold = 5 + percent = 1 + self.assertLess((diff > threshold).float().mean(), percent / 100) + + self.assertLess(diff.mean(), 0.4) +TestNNDeviceType.test_upsamplingBiMode2d_consistency=_test_upsamplingBiMode2d_consistency + +@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) +@parametrize_test("align_corners", [True, False]) +@parametrize_test("input_size, output_size", [(399, 437), (403, 377)]) +def _test_upsamplingBiLinear2d_consistency_interp_size_bug(self, device, memory_format, align_corners, input_size, output_size): + # Non-regression test for https://github.com/pytorch/pytorch/pull/101403 + + if torch.device(device).type == "xpu": + raise SkipTest("XPU implementation is not yet supporting uint8") + + mode = "bilinear" + input_ui8 = torch.randint(0, 256, size=(1, 3, input_size, input_size), dtype=torch.uint8, device=device) + input_ui8 = input_ui8.contiguous(memory_format=memory_format) + input_f32 = input_ui8.float() + + output_f32 = F.interpolate( + input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=False + ).round().to(torch.uint8) + output_ui8 = F.interpolate( + input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=False + ) + torch.testing.assert_close(output_f32, output_ui8, atol=1, rtol=0) +TestNNDeviceType.test_upsamplingBiLinear2d_consistency_interp_size_bug=_test_upsamplingBiLinear2d_consistency_interp_size_bug + +def _test_device_mask(self, device): + def is_xpu(packed): + return packed.data.device.type=="xpu" + for enforce_sorted in [True, False]: + padded, lengths = self._padded_sequence('cpu', torch.float) + packed = rnn_utils.pack_padded_sequence( + padded, lengths, enforce_sorted=enforce_sorted) + self.assertFalse(is_xpu(packed)) + packed = packed.to(device) + self.assertTrue(is_xpu(packed)) + unpacked, _ = rnn_utils.pad_packed_sequence(packed) + self.assertTrue(is_xpu(unpacked)) + self.assertEqual(unpacked.dtype, torch.float) +TestNNDeviceType.test_device_mask=_test_device_mask + +def _test_overwrite_module_params_on_conversion_cpu_device(self, device): + # Test that under the current default settings + # (`torch.__future__.get_overwrite_module_params_on_conversion() == False`), + # a view to a module's parameters is not pointing to the same storage as + # its base variable after converting the module to a different device. + m = nn.Linear(20, 10) + mw = m.weight[:] + m.to(device) + with torch.no_grad(): + # Without using `torch.no_grad()`, this will leak CUDA memory. 
+ # (Issue is filed at https://github.com/pytorch/pytorch/issues/21875) + mw[0][0] = 5 + self.assertTrue(mw[0][0].device.type == "cpu") + self.assertTrue(mw._base[0][0].device.type == "xpu") + + try: + torch.__future__.set_overwrite_module_params_on_conversion(True) + + # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`, + # a view to a module's parameters is still pointing to the same storage as + # its base variable after converting the module to a different device. + m = nn.Linear(20, 10) + mw = m.weight[:] + m.to(device) + with torch.no_grad(): + mw[0][0] = 5 + self.assertTrue(mw[0][0] == mw._base[0][0]) + + # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`, + # `cpu_module.to("cuda")` doesn't preserve previous references to + # `cpu_module`'s parameters or gradients. + m = nn.Linear(20, 10) + m.weight.grad = torch.randn(10, 20) + weight_ref = m.weight + weight_grad_ref = m.weight.grad + m.to(device) + self.assertNotEqual(weight_ref.device, m.weight.device) + self.assertNotEqual(weight_grad_ref.device, m.weight.grad.device) + finally: + torch.__future__.set_overwrite_module_params_on_conversion(False) +TestNNDeviceType.test_overwrite_module_params_on_conversion_cpu_device=_test_overwrite_module_params_on_conversion_cpu_device + def _test_ctc_loss_xpu(self, device): batch_size = 16 input_length = 30 diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index d92474224..483af50f7 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -191,6 +191,8 @@ "sigmoid", "logsigmoid", "sgn", + "sign", + "signbit", "round", "nn.functional.embedding_bag", "bucketize", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 19aa0fe6c..db3df0667 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -473,6 +473,11 @@ supported: - sigmoid - sigmoid.out - sigmoid_ + - sign + - sign.out + - sign_ + - signbit + - signbit.out - sigmoid_backward.grad_input - sigmoid_backward - hardsigmoid.out