diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
index 05a5c9706..87c1f0ce3 100644
--- a/src/ATen/native/xpu/ForeachOpScalarList.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -6,12 +6,15 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
@@ -118,5 +121,38 @@ FOREACH_BINARY_OP_SCALARLIST(pow, true);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul)
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv)
 
+std::vector<Tensor> foreach_tensor_lerp_scalarlist_xpu(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, scalars);
+  if (!can_use_fast_route({tensors1, tensors2}, scalars, true)) {
+    return foreach_tensor_lerp_scalarlist_kernel_slow(
+        tensors1, tensors2, scalars);
+  }
+
+  std::vector<Tensor> vec_res;
+  vec_res.reserve(tensors1.size());
+  for (const auto& t : tensors1) {
+    vec_res.emplace_back(at::empty_like(t));
+  }
+
+  xpu::foreach_lerp_scalarlist_kernel(tensors1, tensors2, scalars, vec_res);
+  return vec_res;
+}
+
+void foreach_tensor_lerp_scalarlist_xpu_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, scalars);
+  if (!can_use_fast_route({tensors1, tensors2}, scalars, true)) {
+    return foreach_tensor_lerp_scalarlist_kernel_slow_(
+        tensors1, tensors2, scalars);
+  }
+
+  xpu::foreach_lerp_scalarlist_kernel_(tensors1, tensors2, scalars);
+}
+
 }; // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp
index a60eb600c..a9ef1ff44 100644
--- a/src/ATen/native/xpu/ForeachReduceOp.cpp
+++ b/src/ATen/native/xpu/ForeachReduceOp.cpp
@@ -1,6 +1,7 @@
 #include
 
 #include
+#include
 #include
 
 namespace at {
@@ -70,5 +71,23 @@ std::vector<Tensor> foreach_tensor_norm_xpu(
   return native::xpu::foreach_norm_kernel(tensors, ord, p, dtype);
 }
 
+
+std::vector<Tensor> foreach_tensor_max_xpu(TensorList tensors) {
+  check_foreach_api_restrictions(tensors);
+  if (!can_use_fast_route(tensors)) {
+    return foreach_tensor_max_slow(tensors);
+  }
+
+  // for parity with max in ReduceAllOps.cpp, as max(empty) is ???
+  TORCH_CHECK(
+      std::all_of(
+          tensors.begin(),
+          tensors.end(),
+          [](const auto& t) { return t.numel() > 0; }),
+      "max(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.");
+
+  return at::native::xpu::foreach_max_kernel(tensors);
+}
+
 } // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/ForeachUnaryOp.cpp b/src/ATen/native/xpu/ForeachUnaryOp.cpp
index 83577707b..2672d5827 100644
--- a/src/ATen/native/xpu/ForeachUnaryOp.cpp
+++ b/src/ATen/native/xpu/ForeachUnaryOp.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -93,6 +94,7 @@ FOREACH_UNARY_OP(round);
 FOREACH_UNARY_OP(frac);
 FOREACH_UNARY_OP(reciprocal);
 FOREACH_UNARY_OP(sign);
+FOREACH_UNARY_OP(rsqrt);
 
 std::vector<Tensor> foreach_tensor_neg_xpu(TensorList tensors) {
   at::native::check_foreach_api_restrictions(tensors);
diff --git a/src/ATen/native/xpu/sycl/ForeachFunctors.h b/src/ATen/native/xpu/sycl/ForeachFunctors.h
index 9f0f33063..70a362fcd 100644
--- a/src/ATen/native/xpu/sycl/ForeachFunctors.h
+++ b/src/ATen/native/xpu/sycl/ForeachFunctors.h
@@ -834,6 +834,77 @@ struct TernaryOpScalarFunctor {
   }
 };
 
+template <typename T, int depth, int r_args_depth, int res_arg_index>
+struct TernaryOpScalarListFunctor {
+  using opmath_t = at::opmath_type<T>;
+
+  template <typename TLA, typename TLW, typename Op>
+  void operator()(
+      int chunk_size,
+      TLA tlAddress,
+      TLW tlWGMeta,
+      sycl::nd_item<1> item_id,
+      Op op) const {
+    static_assert(depth == 2 || depth == 3, "");
+    static_assert(depth >= r_args_depth, "");
+    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
+    auto item_idx = item_id.get_local_id(0);
+    auto item_range = item_id.get_local_range(0);
+    auto group_idx = item_id.get_group(0);
+    int tensor_loc = tlWGMeta[group_idx].wg_to_tensor;
+    int chunk_idx = tlWGMeta[group_idx].wg_to_chunk;
+    int64_t n = tlAddress[tensor_loc].numel_to_tensor;
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args<depth>(args, tlAddress, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+    const opmath_t scalar = tlAddress[tensor_loc].scalar_vals;
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = item_idx;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += item_range) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast<opmath_t>(r_args[0][ii]),
+                 static_cast<opmath_t>(r_args[1][ii]),
+                 scalar);
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += item_range * kILP) {
+        load_args<r_args_depth>(
+            r_args, args, i_start, chunk_size, n, item_idx, item_range);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast<opmath_t>(r_args[0][ii]),
+                 static_cast<opmath_t>(r_args[1][ii]),
+                 scalar);
+        }
+        store_args(
+            args[res_arg_index],
+            r_args[0],
+            i_start,
+            chunk_size,
+            n,
+            item_idx,
+            item_range);
+      }
+    }
+  }
+};
+
 template <typename T>
 struct power_functor {
   T operator()(const T& a, const T& b) const {
diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
index 9e982e4a0..19fda8472 100644
--- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
@@ -5,14 +5,17 @@
 #include
 
 #include
+#include
 #include
 #include
 
+namespace at::native::xpu {
+
 enum class NormType { L1, L2, LInf };
 
 #define SIMD16 16
 #define SIMD32 32
-namespace at::native::xpu {
+
 template <
     typename T,
     NormType norm_type,
@@ -476,4 +479,274 @@ std::vector<Tensor> foreach_norm_kernel(
   return result;
 }
 
+template <typename T, int SIMD>
+struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
+  template <typename TLA, typename TLW>
+  [[intel::reqd_sub_group_size(SIMD)]] void operator()(
+      int64_t chunk_size,
+      TLA tlAddressMeta,
+      TLW tlWGMeta,
+      sycl::nd_item<1> item,
+      T* output_per_tensor_ptr,
+      const int max_chunks_per_tensor) const {
+    auto workgroup_id = item.get_group(0);
+    auto item_id = item.get_local_id(0);
+    auto local_range = item.get_local_range(0);
+
+    const auto tensor_loc = tlWGMeta[workgroup_id].wg_to_tensor;
+    const auto chunk_idx = tlWGMeta[workgroup_id].wg_to_chunk;
+    auto n = tlAddressMeta[tensor_loc].numel_to_tensor;
+
+    T* x = (T*)tlAddressMeta[tensor_loc].addresses[0];
+    x += chunk_idx * chunk_size;
+    n -= chunk_idx * chunk_size;
+
+    T vals[kILP];
+    T r_x[kILP];
+    for (int64_t i = 0; i < kILP; i++) {
+      vals[i] = T(std::numeric_limits<T>::lowest());
+      r_x[i] = T(std::numeric_limits<T>::lowest());
+    }
+
+    if (n % kILP == 0 && (chunk_size & kILP) == 0 && is_aligned(x)) {
+      for (int64_t i_start = item_id;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += local_range) {
+        // load
+        load_store(r_x, x, 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          vals[ii] = max_impl(vals[ii], r_x[ii]);
+        }
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += local_range * kILP) {
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          int i = i_start + item_id + ii * local_range;
+          if (i < n && i < chunk_size) {
+            vals[ii] = max_impl(vals[ii], x[i]);
+          }
+        }
+      }
+    }
+
+    auto val = T(std::numeric_limits<T>::lowest());
+    for (int i = 0; i < kILP; i++) {
+      val = max_impl(val, vals[i]);
+    }
+    auto final_val =
+        GroupReduceMaxWithoutBroadcast<T, SIMD>(item, val, shared_);
+
+    if (item_id == 0) {
+      output_per_tensor_ptr[tensor_loc * max_chunks_per_tensor + chunk_idx] =
+          final_val;
+    }
+  }
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    shared_ = sycl_local_acc_t<T>(SIMD, cgh);
+  }
+
+ private:
+  sycl_local_acc_t<T> shared_;
+};
+
+template <typename T, int SIMD>
+struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
+  [[intel::reqd_sub_group_size(SIMD)]] void operator()(
+      sycl::nd_item<1> item_id) const {
+    auto local_range = item_id.get_local_range(0);
+    auto lid = item_id.get_local_linear_id();
+    auto group_id = item_id.get_group(0);
+
+    const T* output_this_tensor =
+        output_per_tensor_ + group_id * max_chunks_per_tensor_;
+    int chunks_this_tensor = chunks_per_tensor_[group_id];
+    T val = std::numeric_limits<T>::lowest();
+    for (int i = lid; i < chunks_this_tensor; i += local_range) {
+      val = max_impl(val, output_this_tensor[i]);
+    }
+    T final_value =
+        GroupReduceMaxWithoutBroadcast<T, SIMD>(item_id, val, shared_);
+    if (lid == 0) {
+      *(ret_per_tensor_[group_id]) = final_value;
+    }
+  }
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    shared_ = sycl_local_acc_t<T>(SIMD, cgh);
+  }
+
+  LpmaxChunkReduceKernelFunctor(
+      const T* output_per_tensor,
+      T** ret_per_tensor,
+      int* chunks_per_tensor,
+      int max_chunks_per_tensor)
+      : output_per_tensor_(output_per_tensor),
+        ret_per_tensor_(ret_per_tensor),
+        chunks_per_tensor_(chunks_per_tensor),
+        max_chunks_per_tensor_(max_chunks_per_tensor) {}
+
+ private:
+  const T* output_per_tensor_;
+  T** ret_per_tensor_;
+  int* chunks_per_tensor_;
+  int max_chunks_per_tensor_;
+  sycl_local_acc_t<T> shared_;
+};
+
+template <typename T, int SIMD>
+void launch_lpmax_chunk_reduce_kernel(
+    const T* output_per_tensor,
+    T** ret_per_tensor,
+    int* chunks_per_tensor,
+    int max_chunks_per_tensor,
+    int n_tensor) {
+  int wg_size = multi_tensor_apply_kernel_get_wg_size(SIMD);
+  LpmaxChunkReduceKernelFunctor<T, SIMD> kfn(
+      output_per_tensor,
+      ret_per_tensor,
+      chunks_per_tensor,
+      max_chunks_per_tensor);
+
+  sycl_kernel_submit(
+      sycl::range<1>(n_tensor * wg_size),
+      sycl::range<1>(wg_size),
+      getCurrentSYCLQueue(),
+      kfn);
+}
+
+std::vector<Tensor> foreach_max_kernel(TensorList tensors) {
+  const size_t ntensors = tensors.size();
+  const auto options = tensors[0].options();
+
+  auto& q = getCurrentSYCLQueue();
+  // Store output address for each tensor
+  auto addressStorage =
+      at::empty({(int)(sizeof(void*) * ntensors)}, options.dtype(at::kByte));
+  auto metaAddress = static_cast<void*>(addressStorage.mutable_data_ptr());
+  void** tensor_list_addresses = nullptr;
+  auto tensor_list_addresses_dptr =
+      at::xpu::HostAlloc(sizeof(void*) * ntensors);
+  tensor_list_addresses = (void**)tensor_list_addresses_dptr.get();
+
+  // Store thunks count for each tensor
+  auto countsStorage =
+      at::empty({(int)(sizeof(int) * ntensors)}, options.dtype(at::kByte));
+  auto metaCounts = static_cast<void*>(countsStorage.mutable_data_ptr());
+  int* thunk_counts = nullptr;
+  auto thunk_counts_dptr = at::xpu::HostAlloc(sizeof(int) * ntensors);
+  thunk_counts = (int*)thunk_counts_dptr.get();
+
+  int max_chunks_per_tensor = -1;
+  int64_t simd = syclMaxSubGroupSize();
+  int64_t kChunkSize = multi_tensor_apply_kernel_get_chunk_size(simd);
+  for (const auto t : c10::irange(ntensors)) {
+    int max_chunks_this_tensor =
+        (tensors[t].numel() + kChunkSize - 1) / kChunkSize;
+    thunk_counts[t] = max_chunks_this_tensor;
+    if (max_chunks_this_tensor > max_chunks_per_tensor) {
+      max_chunks_per_tensor = max_chunks_this_tensor;
+    }
+  }
+  auto output_per_tensor = at::zeros(
+      {static_cast<int64_t>(ntensors) * max_chunks_per_tensor}, options);
+
+  std::vector<Tensor> vec_res;
+  vec_res.reserve(ntensors);
+  for (const auto i : c10::irange(ntensors)) {
+    vec_res.push_back(at::detail::empty_xpu(
+        {},
+        optTypeMetaToScalarType(options.dtype_opt()),
+        options.layout_opt(),
+        options.device_opt(),
+        options.pinned_memory_opt(),
+        options.memory_format_opt()));
+  }
+
+  auto tensor_lists = std::vector<std::vector<Tensor>>{tensors.vec()};
+  AT_DISPATCH_ALL_TYPES_AND3(
+      kHalf,
+      kBFloat16,
+      kBool,
+      tensor_lists[0][0].scalar_type(),
+      "foreach_tensor_max_xpu_scalar_type",
+      [&]() {
+        if (simd == SIMD32) {
+          multi_tensor_apply<1>(
+              tensor_lists,
+              LpMaxFunctor<scalar_t, SIMD32>(),
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              max_chunks_per_tensor);
+        } else if (simd == SIMD16) {
+          multi_tensor_apply<1>(
+              tensor_lists,
+              LpMaxFunctor<scalar_t, SIMD16>(),
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              max_chunks_per_tensor);
+        } else {
+          TORCH_CHECK(
+              false,
+              "foreach_max_xpu_kernel does not support SIMD width: ",
+              simd);
+        }
+
+        for (int i = 0; i < ntensors; i++) {
+          tensor_list_addresses[i] = vec_res[i].mutable_data_ptr();
+        }
+        q.memcpy(
+            (void*)metaAddress,
+            (void*)tensor_list_addresses,
+            sizeof(void*) * ntensors);
+        at::xpu::CachingHostAllocator_recordEvent(
+            (void*)tensor_list_addresses,
+            tensor_list_addresses_dptr.get_context(),
+            at::xpu::getCurrentXPUStream());
+        q.memcpy(
+            (void*)metaCounts, (void*)thunk_counts, sizeof(int) * ntensors);
+        at::xpu::CachingHostAllocator_recordEvent(
+            (void*)thunk_counts,
+            thunk_counts_dptr.get_context(),
+            at::xpu::getCurrentXPUStream());
+        if (simd == SIMD32) {
+          launch_lpmax_chunk_reduce_kernel<scalar_t, SIMD32>(
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              (scalar_t**)(metaAddress),
+              (int*)(metaCounts),
+              max_chunks_per_tensor,
+              ntensors);
+        } else {
+          launch_lpmax_chunk_reduce_kernel<scalar_t, SIMD16>(
+              output_per_tensor.mutable_data_ptr<scalar_t>(),
+              (scalar_t**)(metaAddress),
+              (int*)(metaCounts),
+              max_chunks_per_tensor,
+              ntensors);
+        }
+      });
+
+  // correctly assign values to only non-empty slots, as the empty slots should
+  // get skipped
+  std::vector<Tensor> result;
+  result.reserve(ntensors);
+  int i = 0;
+  for (const auto& t : tensors) {
+    if (t.numel() != 0) {
+      result.emplace_back(vec_res[i]);
+      i++;
+    } else {
+      result.emplace_back(at::detail::empty_xpu(
+          {},
+          optTypeMetaToScalarType(options.dtype_opt()),
+          options.layout_opt(),
+          options.device_opt(),
+          options.pinned_memory_opt(),
+          options.memory_format_opt()));
+    }
+  }
+  return result;
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
index cefb3d840..e7413b50c 100644
--- a/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachReduceKernels.h
@@ -9,4 +9,6 @@ TORCH_XPU_API std::vector<Tensor> foreach_norm_kernel(
     double p,
     c10::optional<ScalarType> dtype);
 
+TORCH_XPU_API std::vector<Tensor> foreach_max_kernel(TensorList tensors);
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
index ed8e01653..53e937f0f 100644
--- a/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachTernaryKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 
 namespace at::native::xpu {
@@ -123,4 +124,58 @@ void foreach_lerp_scalar_kernel_(
             weight.to<opmath_t>());
       });
 }
+
+void foreach_lerp_scalarlist_kernel(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars,
+    TensorList result) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+      tensors1.vec(), tensors2.vec(), result.vec()};
+
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      tensors1[0].scalar_type(),
+      "foreach_tensor_lerp_scalarlist_xpu",
+      [&]() {
+        using opmath_t = typename at::opmath_type<scalar_t>;
+        multi_tensor_apply<3, opmath_t>(
+            tensor_lists,
+            scalars,
+            TernaryOpScalarListFunctor<
+                scalar_t,
+                /* depth */ 3,
+                /* r_args_depth */ 2,
+                /* res_arg_index */ 2>(),
+            LerpFunctor<opmath_t>());
+      });
+};
+
+void foreach_lerp_scalarlist_kernel_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars) {
+  std::vector<std::vector<at::Tensor>> tensor_lists{
+      tensors1.vec(), tensors2.vec()};
+
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      tensors1[0].scalar_type(),
+      "foreach_tensor_lerp_scalarlist_xpu_",
+      [&]() {
+        using opmath_t = typename at::opmath_type<scalar_t>;
+        multi_tensor_apply<2, opmath_t>(
+            tensor_lists,
+            scalars,
+            TernaryOpScalarListFunctor<
+                scalar_t,
+                /* depth */ 2,
+                /* r_args_depth */ 2,
+                /* res_arg_index */ 0>(),
+            LerpFunctor<opmath_t>());
+      });
+};
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h
new file mode 100644
index 000000000..cd3afd665
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h
@@ -0,0 +1,17 @@
+#pragma once
+#include
+
+namespace at::native::xpu {
+
+TORCH_XPU_API void foreach_lerp_scalarlist_kernel(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars,
+    TensorList result);
+
+TORCH_XPU_API void foreach_lerp_scalarlist_kernel_(
+    TensorList tensors1,
+    TensorList tensors2,
+    at::ArrayRef<Scalar> scalars);
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
index b24cd2c81..073ab8908 100644
--- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace at::native::xpu {
@@ -391,6 +392,29 @@ struct Sign {
   }
 };
 
+template <typename T>
+struct Rsqrt {
+  T operator()(T t) const {
+    return c10::xpu::compat::rsqrt(t);
+  }
+};
+
+template <>
+struct Rsqrt<c10::complex<float>> {
+  c10::complex<float> operator()(c10::complex<float> t) const {
+    const auto one = c10::complex<float>(1.0, 0);
+    return one / std::sqrt(t);
+  }
+};
+
+template <>
+struct Rsqrt<c10::complex<double>> {
+  c10::complex<double> operator()(c10::complex<double> t) const {
+    const auto one = c10::complex<double>(1.0, 0);
+    return one / std::sqrt(t);
+  }
+};
+
 FOREACH_UNARY_INPLACE_KERNEL(sigmoid) {
   return floating_complex_half_bfloat16_<Sigmoid>(tensors);
 }
@@ -426,6 +450,13 @@ FOREACH_UNARY_KERNEL(sign) {
   return floating_half_bfloat16<Sign>(tensors);
 }
 
+FOREACH_UNARY_INPLACE_KERNEL(rsqrt) {
+  return floating_complex_half_bfloat16_<Rsqrt>(tensors);
+}
+FOREACH_UNARY_KERNEL(rsqrt) {
+  return floating_complex_half_bfloat16<Rsqrt>(tensors);
+}
+
 FOREACH_UNARY_INPLACE_KERNEL(neg) {
   return all_types_half_complex_bfloat16_<Neg>(tensors);
 }
diff --git a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
index 4af3ffede..8ac04f41e 100644
--- a/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
+++ b/src/ATen/native/xpu/sycl/ForeachUnaryKernels.h
@@ -94,6 +94,9 @@ TORCH_XPU_API FOREACH_UNARY_KERNEL(reciprocal);
 TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(sign);
 TORCH_XPU_API FOREACH_UNARY_KERNEL(sign);
 
+TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(rsqrt);
+TORCH_XPU_API FOREACH_UNARY_KERNEL(rsqrt);
+
 TORCH_XPU_API FOREACH_UNARY_INPLACE_KERNEL(neg);
 TORCH_XPU_API FOREACH_UNARY_KERNEL(neg);
 
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 5bd989848..02b9e69d9 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -679,6 +679,10 @@
         # FATAL: Unexpected page fault from GPU at 0x0, ctx_id: 1 (CCS) type: 0 (NotPresent), level: 3 (PML4), access: 0 (Read), banned: 1, aborting.
         # FATAL: Unexpected page fault from GPU at 0x0, ctx_id: 1 (CCS) type: 0 (NotPresent), level: 3 (PML4), access: 0 (Read), banned: 1, aborting.
         "test_dtypes__refs_nn_functional_pdist_xpu",
+
+        # 2025 bundle std::pow complex result is different on host and device
+        "test_python_ref__refs_square_xpu_complex64",
+        "test_python_ref_torch_fallback__refs_square_xpu_complex64",
     ),
 
     "test_binary_ufuncs_xpu.py": (
@@ -1245,6 +1249,15 @@
         # Compiler issue in handling tanh with real or imag inf.
         # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974
         "test_reference_numerics_large__refs_tanh_xpu_complex32",
+
+        # 2025 bundle std::pow complex result is different on host and device
+        "test_exp_xpu_complex64",
+        "test_reference_numerics_extremal__refs_exp2_xpu_complex64",
+        "test_reference_numerics_extremal__refs_exp_xpu_complex64",
+        "test_reference_numerics_extremal_exp2_xpu_complex64",
+        "test_reference_numerics_extremal_exp_xpu_complex64",
+        "test_reference_numerics_large__refs_exp_xpu_complex32",
+        "test_reference_numerics_large_exp_xpu_complex32",
     ),
 
     "test_masked_xpu.py": (
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index 64e94073b..6b0a1221d 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -3094,6 +3094,29 @@
   dispatch:
     CompositeExplicitAutograd: _foreach_copy
 
+- func: _foreach_max(Tensor[] self) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_max_slow
+    XPU: foreach_tensor_max_xpu
+  autogen: _foreach_max.out
+
+- func: _foreach_rsqrt(Tensor[] self) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_rsqrt_slow
+    XPU: foreach_tensor_rsqrt_xpu
+
+- func: _foreach_rsqrt_(Tensor(a!)[] self) -> ()
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_rsqrt_slow_
+    XPU: foreach_tensor_rsqrt_xpu_
+  autogen: _foreach_rsqrt.out
+
 - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
   variants: function, method
   dispatch:
@@ -3688,6 +3711,22 @@
     XPU: foreach_tensor_lerp_list_xpu_
   autogen: _foreach_lerp.Scalar_out
 
+- func: _foreach_lerp.ScalarList(Tensor[] self, Tensor[] tensors1, Scalar[] weight) -> Tensor[]
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_lerp_scalarlist_kernel_slow
+    XPU: foreach_tensor_lerp_scalarlist_xpu
+  autogen: _foreach_lerp.ScalarList_out
+
+- func: _foreach_lerp_.ScalarList(Tensor(a!)[] self, Tensor[] tensors1, Scalar[] weight) -> ()
+  device_check: NoCheck # foreach kernels fall back to slow path when tensors are on different devices
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: foreach_tensor_lerp_scalarlist_kernel_slow_
+    XPU: foreach_tensor_lerp_scalarlist_xpu_
+  autogen: _foreach_lerp.ScalarList_out
+
 - func: acos(Tensor self) -> Tensor
   device_check: NoCheck # TensorIterator
   variants: function, method
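
For reviewers who want to exercise the newly registered kernels from Python, the sketch below is a minimal, hypothetical usage example and is not part of the patch itself. It assumes a PyTorch build where torch-xpu-ops is loaded, that `torch.xpu.is_available()` reports a device, and that the private `torch._foreach_*` bindings are exposed the same way they are upstream; shapes and weights are arbitrary.

import torch

# Hypothetical smoke test for the ops added in this patch.
device = "xpu" if torch.xpu.is_available() else "cpu"
xs = [torch.rand(8, device=device) for _ in range(3)]
ys = [torch.rand(8, device=device) for _ in range(3)]

maxes = torch._foreach_max(xs)        # one 0-dim tensor per input tensor
rsqrts = torch._foreach_rsqrt(xs)     # out-of-place 1/sqrt(x) per tensor
torch._foreach_rsqrt_(xs)             # in-place variant

# ScalarList overload of lerp: one scalar weight per tensor pair.
lerped = torch._foreach_lerp(xs, ys, [0.1, 0.5, 0.9])
torch._foreach_lerp_(xs, ys, [0.1, 0.5, 0.9])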