diff --git a/src/ATen/native/xpu/ForeachOpList.cpp b/src/ATen/native/xpu/ForeachOpList.cpp
index 6813a91ae..73f23f39e 100644
--- a/src/ATen/native/xpu/ForeachOpList.cpp
+++ b/src/ATen/native/xpu/ForeachOpList.cpp
@@ -1,4 +1,11 @@
 #include
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_lerp_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
+
 #include
 #include
 #include
@@ -8,29 +15,6 @@
 namespace at {
 namespace native {
 
-::std::vector<at::Tensor> foreach_tensor_mul_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other);
-void foreach_tensor_mul_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other);
-
-::std::vector<at::Tensor> foreach_tensor_div_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other);
-void foreach_tensor_div_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other);
-
-::std::vector<at::Tensor> foreach_tensor_add_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList other,
-    const at::Scalar& alpha);
-void foreach_tensor_add_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList other,
-    const at::Scalar& alpha);
-
 #define FOREACH_BINARY_OP_LIST(NAME, DIVISION_OP)                \
   void foreach_tensor_##NAME##_list_kernel_xpu_(                 \
       TensorList tensors1, TensorList tensors2) {                \
@@ -81,28 +65,6 @@ FOREACH_BINARY_OP_LIST_ALPHA(add);
 FOREACH_BINARY_OP_LIST(mul, false);
 FOREACH_BINARY_OP_LIST(div, true);
 
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcmul_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
 #define FOREACH_POINTWISE_OP_TENSOR(NAME)                        \
   std::vector<Tensor> foreach_tensor_##NAME##_list_kernel_xpu(   \
       TensorList input,                                          \
@@ -142,11 +104,6 @@ void foreach_tensor_addcdiv_scalarlist_slow_(
 FOREACH_POINTWISE_OP_TENSOR(addcmul)
 FOREACH_POINTWISE_OP_TENSOR(addcdiv)
 
-::std::vector<at::Tensor> foreach_tensor_ternary_lerp_slow(
-    at::TensorList self,
-    at::TensorList tensors1,
-    at::TensorList weights);
-
 std::vector<Tensor> foreach_tensor_lerp_ternary_xpu(
     TensorList tensors1,
     TensorList tensors2,
@@ -166,11 +123,6 @@ std::vector<Tensor> foreach_tensor_lerp_ternary_xpu(
   return vec_res;
 }
 
-void foreach_tensor_ternary_lerp_slow_(
-    at::TensorList self,
-    at::TensorList tensors1,
-    at::TensorList weights);
-
 void foreach_tensor_lerp_ternary_xpu_(
     TensorList tensors1,
     TensorList tensors2,
diff --git a/src/ATen/native/xpu/ForeachOpScalar.cpp b/src/ATen/native/xpu/ForeachOpScalar.cpp
index 46b908ced..5a581a289 100644
--- a/src/ATen/native/xpu/ForeachOpScalar.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalar.cpp
@@ -1,34 +1,18 @@
 #include
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_lerp_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 #include
 #include
 #include
 
 namespace at {
-
 namespace native {
 
-::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_add_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_mul_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_div_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
 #define FOREACH_BINARY_OP_SCALAR(NAME, DIV_OP)                   \
   void foreach_tensor_##NAME##_scalar_kernel_xpu_(               \
       TensorList tensors, const Scalar& scalar) {                \
@@ -54,28 +38,6 @@ FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true);
 
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-void foreach_tensor_addcmul_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-void foreach_tensor_addcdiv_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value);
-
 #define FOREACH_POINTWISE_OP_SCALAR(NAME)                        \
   std::vector<Tensor> foreach_tensor_##NAME##_scalar_xpu(        \
       TensorList input,                                          \
@@ -112,15 +74,6 @@ void foreach_tensor_addcdiv_scalar_slow_(
 FOREACH_POINTWISE_OP_SCALAR(addcmul)
 FOREACH_POINTWISE_OP_SCALAR(addcdiv)
 
-::std::vector<at::Tensor> foreach_tensor_lerp_list_kernel_slow(
-    at::TensorList self,
-    at::TensorList tensors1,
-    const at::Scalar& weight);
-void foreach_tensor_lerp_list_kernel_slow_(
-    at::TensorList self,
-    at::TensorList tensors1,
-    const at::Scalar& weight);
-
 std::vector<Tensor> foreach_tensor_lerp_list_xpu(
     TensorList tensors1,
     TensorList tensors2,
diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
index 6ac047476..1433e08bd 100644
--- a/src/ATen/native/xpu/ForeachOpScalarList.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -1,4 +1,9 @@
 #include
+#include <ATen/ops/_foreach_add_native.h>
+#include <ATen/ops/_foreach_addcdiv_native.h>
+#include <ATen/ops/_foreach_addcmul_native.h>
+#include <ATen/ops/_foreach_div_native.h>
+#include <ATen/ops/_foreach_mul_native.h>
 
 #include
 #include
@@ -8,44 +13,6 @@
 namespace at {
 namespace native {
 
-::std::vector<at::Tensor> foreach_tensor_add_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_add_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-::std::vector<at::Tensor> foreach_tensor_mul_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_mul_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-
-::std::vector<at::Tensor> foreach_tensor_add_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_add_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-::std::vector<at::Tensor> foreach_tensor_mul_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_mul_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-
-::std::vector<at::Tensor> foreach_tensor_div_scalar_kernel_slow(
-    at::TensorList self,
-    const at::Scalar& scalar);
-void foreach_tensor_div_scalar_kernel_slow_(
-    at::TensorList self,
-    const at::Scalar& scalar);
-::std::vector<at::Tensor> foreach_tensor_div_scalarlist_kernel_slow(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_div_scalarlist_kernel_slow_(
-    at::TensorList self,
-    at::ArrayRef<at::Scalar> scalars);
 
 #define FOREACH_BINARY_OP_SCALARLIST(NAME, DIV_OP)               \
   void foreach_tensor_##NAME##_scalar_kernel_xpu_(               \
@@ -74,47 +41,6 @@ FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true);
 
-void foreach_tensor_addcmul_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcmul_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalar_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalar_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    const at::Scalar& value = 1);
-::std::vector<at::Tensor> foreach_tensor_addcdiv_scalarlist_slow(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-void foreach_tensor_addcdiv_scalarlist_slow_(
-    at::TensorList self,
-    at::TensorList tensor1,
-    at::TensorList tensor2,
-    at::ArrayRef<at::Scalar> scalars);
-
 #define FOREACH_POINTWISE_OP_SCALARLIST(NAME)                    \
   std::vector<Tensor> foreach_tensor_##NAME##_scalarlist_xpu(    \
       TensorList input,                                          \
diff --git a/src/ATen/native/xpu/ForeachUnaryOp.cpp b/src/ATen/native/xpu/ForeachUnaryOp.cpp
index 89cd0ab4e..4492d8313 100644
--- a/src/ATen/native/xpu/ForeachUnaryOp.cpp
+++ b/src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -1,4 +1,6 @@
 #include
+#include <ATen/ops/_foreach_sqrt_native.h>
+
 #include
 
 namespace at {
@@ -6,9 +8,6 @@ namespace native {
 
 // given a functor and a "dispatch function", creates the outplace and inplace
 // operations
-::std::vector<at::Tensor> foreach_tensor_sqrt_slow(at::TensorList self);
-void foreach_tensor_sqrt_slow_(at::TensorList self);
-
 #define FOREACH_UNARY_OP(op_name)                                          \
   std::vector<Tensor> foreach_tensor_##op_name##_xpu(TensorList tensors) { \
     check_foreach_api_restrictions(tensors);                               \
diff --git a/src/ATen/native/xpu/Nonzero.cpp b/src/ATen/native/xpu/Nonzero.cpp
index deb646f6c..9988631d3 100644
--- a/src/ATen/native/xpu/Nonzero.cpp
+++ b/src/ATen/native/xpu/Nonzero.cpp
@@ -5,7 +5,7 @@
 #include
 
 namespace at {
-namespace native{
+namespace native {
 Tensor& nonzero_out_xpu(const Tensor& self, Tensor& out) {
   TORCH_CHECK(
       self.numel() < std::numeric_limits<int>::max(),
@@ -38,5 +38,5 @@ Tensor nonzero_xpu(const Tensor& self) {
   nonzero_out_xpu(self, out);
   return out;
 }
-}
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index ad0a6ffc6..c9376f177 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -31,7 +31,8 @@ Tensor& arange_out_xpu(
 
     TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
     TORCH_CHECK(
-        std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
+        std::isfinite(static_cast<double>(xstart)) &&
+            std::isfinite(static_cast<double>(xend)),
         "unsupported range: ",
         xstart,
         " -> ",
@@ -99,7 +100,8 @@ Tensor& range_xpu_out(
 
     TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
     TORCH_CHECK(
-        std::isfinite(static_cast<double>(xstart)) && std::isfinite(static_cast<double>(xend)),
+        std::isfinite(static_cast<double>(xstart)) &&
+            std::isfinite(static_cast<double>(xend)),
         "unsupported range: ",
         xstart,
         " -> ",
diff --git a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
index f60613ec7..fd1b966d7 100644
--- a/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ActivationGluKernels.cpp
@@ -2,7 +2,6 @@
 #include
 #include
 
-#include
 #include
 #include
 
diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
index 21168f8a3..ed8e01653 100644
--- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -5,8 +5,8 @@
 #include
 #include
 
-#include
 #include
+#include
 
 namespace at::native::xpu {
 
diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
index 230b693f5..04f2021ea 100644
--- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h
@@ -4,9 +4,13 @@
 
 namespace at::native::xpu {
 
-TORCH_XPU_API void addcmul_kernel(TensorIteratorBase& iter, const Scalar& value);
+TORCH_XPU_API void addcmul_kernel(
+    TensorIteratorBase& iter,
+    const Scalar& value);
 
-TORCH_XPU_API void addcdiv_kernel(TensorIteratorBase& iter, const Scalar& value);
+TORCH_XPU_API void addcdiv_kernel(
+    TensorIteratorBase& iter,
+    const Scalar& value);
 
 TORCH_XPU_API void mse_backward_kernel(
     TensorIterator& iter,
diff --git a/src/ATen/native/xpu/sycl/PowKernels.cpp b/src/ATen/native/xpu/sycl/PowKernels.cpp
index e080511d2..7b19fa4db 100644
--- a/src/ATen/native/xpu/sycl/PowKernels.cpp
+++ b/src/ATen/native/xpu/sycl/PowKernels.cpp
@@ -38,7 +38,8 @@ static inline c10::complex<T> pow_(c10::complex<T> base, c10::complex<T> exp) {
 } // namespace impl
 
 #ifdef _MSC_VER
-// Divergence for MSVC due to accuracy issue. https://github.com/intel/torch-xpu-ops/issues/842.
+// Divergence for MSVC due to accuracy issue.
+// https://github.com/intel/torch-xpu-ops/issues/842.
 template <typename scalar_t>
 struct PowTensorTensorCastFunctor {
   using opmath_t = at::opmath_type<scalar_t>;
diff --git a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
index ef405be49..4527d51bd 100644
--- a/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
+++ b/src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -1,8 +1,8 @@
 #include
 #include
 
-#include
 #include
+#include
 
 #include
 
diff --git a/src/ATen/native/xpu/sycl/Sorting.cpp b/src/ATen/native/xpu/sycl/Sorting.cpp
index cf41810dc..05fba0bb9 100644
--- a/src/ATen/native/xpu/sycl/Sorting.cpp
+++ b/src/ATen/native/xpu/sycl/Sorting.cpp
@@ -16,8 +16,8 @@
 #include
 #include
 
-#include
 #include
+#include
 
 namespace at::native::xpu {
 
diff --git a/src/bridge.cpp b/src/bridge.cpp
index c19ce2554..714a91103 100644
--- a/src/bridge.cpp
+++ b/src/bridge.cpp
@@ -20,10 +20,11 @@ class LoadTorchXPUOps {
   LoadTorchXPUOps() {
     if (NULL == LoadLibrary(PATH_TO_TORCH_XPU_OPS_ATEN_LIB)) {
       std::ostringstream error;
-      error << "PyTorch XPU operators library is not loaded (ERROR: "
-            << GetLastError()
-            << "). Please check if PyTorch is installed correctly."
-            << " Or please file an issue on https://github.com/intel/torch-xpu-ops/issues.";
+      error
+          << "PyTorch XPU operators library is not loaded (ERROR: "
+          << GetLastError()
+          << "). Please check if PyTorch is installed correctly."
+          << " Or please file an issue on https://github.com/intel/torch-xpu-ops/issues.";
       throw std::runtime_error(error.str());
     }
   }
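
Note on the recurring change in the `ForeachOp*.cpp` hunks above: the hand-written forward declarations of the `*_slow` fallback kernels are deleted, and the declarations are instead pulled in from the per-operator headers that PyTorch's codegen emits. The snippet below is a minimal sketch of that pattern, not code taken from this patch: the `ATen/ops/_foreach_mul_native.h` name follows the usual `ATen/ops/<op>_native.h` codegen convention, and the simplified function body stands in for what the `FOREACH_BINARY_OP_LIST` macro expands to in the real file.

```cpp
// Sketch only. Assumes the generated header ATen/ops/_foreach_mul_native.h
// declares foreach_tensor_mul_list_kernel_slow (the codegen convention);
// the body is a simplified stand-in for the macro-generated XPU kernel.
#include <ATen/core/Tensor.h>
#include <ATen/native/ForeachUtils.h>
#include <ATen/ops/_foreach_mul_native.h>

#include <vector>

namespace at::native {

std::vector<Tensor> foreach_tensor_mul_list_kernel_xpu(
    TensorList tensors1,
    TensorList tensors2) {
  check_foreach_api_restrictions(tensors1, tensors2);
  if (can_use_fast_route(tensors1, tensors2)) {
    // The fast multi-tensor XPU path would be dispatched here (elided).
  }
  // Slow fallback: its declaration now comes from the included native
  // header rather than from a hand-written declaration in this .cpp file.
  return foreach_tensor_mul_list_kernel_slow(tensors1, tensors2);
}

} // namespace at::native
```

In the real sources this pattern stays wrapped in the `FOREACH_BINARY_OP_*` and `FOREACH_POINTWISE_OP_*` macros shown in the hunks; the only change in the Foreach files is where the slow-path declarations come from.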