diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml new file mode 100644 index 000000000..7cf2746c3 --- /dev/null +++ b/.github/workflows/_linux_ut.yml @@ -0,0 +1,123 @@ +name: inductor-xpu-ut-test + +on: + workflow_call: + inputs: + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch + runner: + required: true + type: string + default: 'linux.idc.xpu' + description: Set runner + + +jobs: + Inductor-XPU-UT-Tests: + runs-on: ${{ inputs.runner }} + timeout-minutes: 900 + steps: + - name: Checkout torch-xpu-ops + uses: actions/checkout@v4 + - name: Prepare Stock Pytorch + run: | + pwd + cd ../ && rm -rf pytorch + git clone -b ${{ inputs.pytorch_branch }} https://github.com/pytorch/pytorch + cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive + if [ -z ${{ inputs.torch_xpu_ops_update }} ]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + if [[ ${{ inputs.torch_xpu_ops_update }} == 'true' ]]; then + rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ + else + echo "Not update torch-xpu-ops" + fi + fi + # Workaround for torch-xpu-ops ci test + sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt + - name: Build Pytorch XPU + run: | + which conda && conda clean -ay + conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ + rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} + conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y + source activate xpu_op_${ZE_AFFINITY_MASK} + conda install -c intel mkl-static mkl-include -y + cd ../pytorch + pip install -r requirements.txt + export USE_XPU=1 + source /opt/intel/oneapi/compiler/latest/env/vars.sh + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} + python setup.py bdist_wheel + pip install --force-reinstall dist/*.whl + git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. + pip install -r .ci/docker/requirements-ci.txt + - name: Run XPU OP Examples + if: contains(inputs.ut_suite, 'op_example') + run: | + cd ${{ github.workspace }} + mkdir -p ut_log + xpu-smi discovery + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ${{ github.workspace }} + cd examples + pip install pytest + timeout 8000 pytest -v + - name: Run XPU OP Extended UT + if: contains(inputs.ut_suite, 'op_extended') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ + timeout 10000 python run_test_with_skip.py + - name: Run XPU OP UT + if: contains(inputs.ut_suite, 'op_ut') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + export PYTORCH_ENABLE_XPU_FALLBACK=1 + export PYTORCH_TEST_WITH_SLOW=1 + cd ../pytorch/third_party/torch-xpu-ops/test/xpu + timeout 10000 python run_test_with_skip.py + # Cases run with a on-demand white list, since some suites are too + # slow to go through all operators on CPU. 
So add cases on-demand + # when XPU implementatoin is done. + # test_foreach, test_decomp + timeout 10000 python run_test_with_only.py + - name: Run Torch XPU UT + if: contains(inputs.ut_suite, 'torch_xpu') + run: | + source /opt/intel/oneapi/compiler/latest/env/vars.sh + source activate xpu_op_${ZE_AFFINITY_MASK} + cd ../pytorch + TEST_REPORTS_DIR=$(pwd)/test/test-reports + rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" + # Run Pytorch XPU binary UT + for xpu_case in build/bin/*{xpu,sycl}*; do + if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then + case_name=$(basename "$xpu_case") + echo "Testing ${case_name} ..." + "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml + fi + done + # Run Pytorch XPU python UT + export PYTORCH_ENABLE_XPU_FALLBACK=1 + sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py + python test/run_test.py --xpu + diff --git a/.github/workflows/inductor_xpu_e2e_nightly.yml b/.github/workflows/inductor_xpu_e2e_nightly.yml index 8307edae7..a8d316580 100644 --- a/.github/workflows/inductor_xpu_e2e_nightly.yml +++ b/.github/workflows/inductor_xpu_e2e_nightly.yml @@ -41,6 +41,21 @@ on: type: string default: '' description: If set, will only launch this one + torch_xpu_ops_update: + required: false + type: string + default: 'true' + description: True means update xpu_ops when building pytorch, otherwise means not + ut_suite: + required: true + type: string + default: 'op_example,op_extended,op_ut,torch_xpu' + description: op_example,op_extended,op_ut,torch_xpu. Delimiter is comma + pytorch_branch: + required: false + type: string + default: 'main' + description: Set pytorch branch permissions: read-all @@ -244,6 +259,15 @@ jobs: name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} path: ${{ github.workspace }}/upload_files + Inductor-XPU-UT-Nightly-Tests: + if: ${{ inputs.ut_suite }} + name: Nightly Inductor XPU UT Test + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: ${{ inputs.ut_suite }} + pytorch_branch: ${{ inputs.pytorch_branch }} + runner: linux.idc.xpu + Tests-Failure-And-Report: if: always() runs-on: pvc_e2e diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bd65effa9..1bd635d1a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -23,84 +23,8 @@ jobs: # Don't run on forked repos and draft PRs if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }} name: preci-ut - runs-on: linux.idc.xpu - timeout-minutes: 240 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v3 - - name: Prepare Stock Pytorch - run: | - pwd - cd ../ && rm -rf pytorch - git clone -b main https://github.com/pytorch/pytorch - cd pytorch && git log -n 1 && git submodule sync && git submodule update --init --recursive - rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/ - # Workaround for torch-xpu-ops ci test - sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt - - name: Build Pytorch XPU - run: | - which conda && conda clean -ay - conda remove --all -y -n xpu_op_${ZE_AFFINITY_MASK} || \ - rm -rf $(dirname ${CONDA_EXE})/../envs/xpu_op_${ZE_AFFINITY_MASK} - conda create -n xpu_op_${ZE_AFFINITY_MASK} python=3.10 cmake ninja -y - source activate xpu_op_${ZE_AFFINITY_MASK} - conda install -c intel mkl-static mkl-include -y - cd ../pytorch - pip install -r requirements.txt - export USE_XPU=1 - source 
/opt/intel/oneapi/compiler/latest/env/vars.sh - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - python setup.py bdist_wheel - pip install --force-reinstall dist/*.whl - git clone https://github.com/pytorch/vision && cd vision && python setup.py install && cd .. - pip install -r .ci/docker/requirements-ci.txt - - name: Run XPU OP Examples - if: ${{ hashFiles('examples/') != '' }} - run: | - xpu-smi discovery - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd examples - pip install pytest - timeout 8000 pytest -v - - name: Run XPU OP Extended UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu/extended/ - timeout 10000 python run_test_with_skip.py - - name: Run XPU OP UT - if: ${{ hashFiles('test/xpu/') != '' }} - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - export PYTORCH_ENABLE_XPU_FALLBACK=1 - export PYTORCH_TEST_WITH_SLOW=1 - cd ../pytorch/third_party/torch-xpu-ops/test/xpu - timeout 10000 python run_test_with_skip.py - # Cases run with a on-demand white list, since some suites are too - # slow to go through all operators on CPU. So add cases on-demand - # when XPU implementatoin is done. - # test_foreach, test_decomp - timeout 10000 python run_test_with_only.py - - name: Run Torch XPU UT - run: | - source /opt/intel/oneapi/compiler/latest/env/vars.sh - source activate xpu_op_${ZE_AFFINITY_MASK} - cd ../pytorch - TEST_REPORTS_DIR=$(pwd)/test/test-reports - rm -rf "$TEST_REPORTS_DIR" && mkdir -p "$TEST_REPORTS_DIR" - # Run Pytorch XPU binary UT - for xpu_case in build/bin/*{xpu,sycl}*; do - if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then - case_name=$(basename "$xpu_case") - echo "Testing ${case_name} ..." 
- "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml - fi - done - # Run Pytorch XPU python UT - export PYTORCH_ENABLE_XPU_FALLBACK=1 - sed -i 's/selected_tests = exclude_tests(XPU_BLOCKLIST.*/selected_tests = XPU_TEST/g' ./test/run_test.py - python test/run_test.py --xpu + uses: ./.github/workflows/_linux_ut.yml + with: + ut_suite: op_example,op_extended,op_ut,torch_xpu + runner: linux.idc.xpu + \ No newline at end of file diff --git a/src/ATen/native/xpu/Bucketization.cpp b/src/ATen/native/xpu/Bucketization.cpp new file mode 100644 index 000000000..0d6c2a9f5 --- /dev/null +++ b/src/ATen/native/xpu/Bucketization.cpp @@ -0,0 +1,125 @@ +#include +#include +#include +#include + +namespace at { + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned sorter_maybe_owned = + at::borrow_from_optional_tensor(sorter_opt); + const Tensor& sorter = *sorter_maybe_owned; + at::native::searchsorted_pre_check( + sorted_sequence, self, result, out_int32, right, side_opt, sorter); + at::native::resize_output(result, self.sizes()); + + if (self.numel() == 0) { + return result; + } + + // we have two inputs to set right, pre_check checks that they aren't set to + // opposites + bool is_right = (side_opt && *side_opt == "right") || right; + at::native::xpu::searchsorted_kernel( + result, self, sorted_sequence, out_int32, is_right, sorter); + return result; +} + +Tensor& XPUNativeFunctions::searchsorted_out( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter_opt, + Tensor& result) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted_out( + sorted_sequence, + scalar_tensor, + out_int32, + right, + side_opt, + sorter_opt, + result); +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Tensor& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + searchsorted_out( + sorted_sequence, self, out_int32, right, side_opt, sorter, result); + return result; +} + +Tensor XPUNativeFunctions::searchsorted( + const Tensor& sorted_sequence, + const Scalar& self, + bool out_int32, + bool right, + const std::optional side_opt, + const std::optional& sorter) { + const Tensor& scalar_tensor = + at::native::searchsorted_scalar_tensor(self, sorted_sequence.device()); + return searchsorted( + sorted_sequence, scalar_tensor, out_int32, right, side_opt, sorter); +} + +Tensor& XPUNativeFunctions::bucketize_out( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right, + Tensor& result) { + TORCH_CHECK( + boundaries.dim() == 1, + "boundaries tensor must be 1 dimension, but got dim(", + boundaries.dim(), + ")"); + searchsorted_out( + boundaries, self, out_int32, right, nullopt, nullopt, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Tensor& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); + Tensor result = at::empty({0}, options, MemoryFormat::Contiguous); + bucketize_out(self, boundaries, out_int32, right, result); + return result; +} + +Tensor XPUNativeFunctions::bucketize( + const Scalar& self, + const Tensor& boundaries, + bool out_int32, + bool right) { + return bucketize( + at::native::searchsorted_scalar_tensor(self, boundaries.device()), + boundaries, + out_int32, + right); +} +} // namespace at diff --git a/src/ATen/native/xpu/LinearAlgebra.cpp b/src/ATen/native/xpu/LinearAlgebra.cpp index dfc7bd70b..2f857f18b 100644 --- a/src/ATen/native/xpu/LinearAlgebra.cpp +++ b/src/ATen/native/xpu/LinearAlgebra.cpp @@ -1,15 +1,9 @@ #include -#include -#include -#include -#include #include #include -#include -#include - #include #include +#include #include namespace at { diff --git a/src/ATen/native/xpu/PointwiseOps.cpp b/src/ATen/native/xpu/PointwiseOps.cpp index 210cec3e6..a01bdc391 100644 --- a/src/ATen/native/xpu/PointwiseOps.cpp +++ b/src/ATen/native/xpu/PointwiseOps.cpp @@ -6,6 +6,63 @@ namespace at { +TensorIterator addcdiv_meta( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + if (isIntegralType(tensor1.scalar_type(), /*includeBool=*/true) && + isIntegralType(tensor2.scalar_type(), /*includeBool=*/true)) { + TORCH_CHECK( + false, + "Integer division with addcdiv is no longer supported, and in a future ", + "release addcdiv will perform a true division of tensor1 and tensor2. ", + "The historic addcdiv behavior can be implemented as ", + "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ", + "for integer inputs and as ", + "(input + value * tensor1 / tensor2) for float inputs. 
", + "The future addcdiv behavior is just the latter implementation: ", + "(input + value * tensor1 / tensor2), for all dtypes."); + } + + TensorIterator iter; + iter.build_ternary_op(out, self, tensor1, tensor2); + return iter; +} + +Tensor& XPUNativeFunctions::addcdiv_out( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value, + Tensor& out) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return out; +} + +Tensor XPUNativeFunctions::addcdiv( + const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + Tensor out; + auto iter = addcdiv_meta(self, tensor1, tensor2, value, out); + native::xpu::addcdiv_kernel(iter, value); + return iter.output(); +} + +Tensor& XPUNativeFunctions::addcdiv_( + Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value) { + auto iter = addcdiv_meta(self, tensor1, tensor2, value, self); + native::xpu::addcdiv_kernel(iter, value); + return self; +} + TensorIterator addcmul_meta( const Tensor& self, const Tensor& tensor1, diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp index d7b79902f..110590958 100644 --- a/src/ATen/native/xpu/TensorFactories.cpp +++ b/src/ATen/native/xpu/TensorFactories.cpp @@ -1,8 +1,8 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include -#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -18,6 +18,25 @@ namespace at { +Tensor& XPUNativeFunctions::eye_out(int64_t n, Tensor& result) { + return XPUNativeFunctions::eye_out(n, n, result); +} + +Tensor& XPUNativeFunctions::eye_out(int64_t n, int64_t m, Tensor& result) { + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + Tensor XPUNativeFunctions::empty( IntArrayRef size, c10::optional dtype_opt, diff --git a/src/ATen/native/xpu/TensorProperties.cpp b/src/ATen/native/xpu/TensorProperties.cpp new file mode 100644 index 000000000..428d18fcd --- /dev/null +++ b/src/ATen/native/xpu/TensorProperties.cpp @@ -0,0 +1,16 @@ +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#endif + +namespace at { + +bool XPUNativeFunctions::is_set_to(const Tensor& self, const Tensor& src) { + return at::native::is_set_to(self, src); +} + +} // namespace at diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 419af3b8f..581ec0d54 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -561,4 +562,52 @@ Tensor& XPUNativeFunctions::tan_out(const Tensor& self, Tensor& out) { return out; } +Tensor& XPUNativeFunctions::conj_physical_out(const Tensor& self, Tensor& out) { + auto iter = TensorIterator::unary_op(out, self); + native::xpu::conj_physical_kernel(iter); + return out; +} + +Tensor& XPUNativeFunctions::conj_physical_(Tensor& self) { + if (!self.is_complex()) + return self; + return XPUNativeFunctions::conj_physical_out(self, self); +} + +TensorIterator ceil_meta(const Tensor& self, Tensor& out) { + TORCH_CHECK(!self.is_complex(), "ceil is not supported for complex inputs"); + TensorIterator iter; + 
iter.build_borrowing_unary_op(out, self); + return iter; +} + +Tensor XPUNativeFunctions::ceil(const Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self.clone(); + } + Tensor out; + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::ceil_(Tensor& self) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + return self; + } + auto iter = ceil_meta(self, self); + native::xpu::ceil_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) { + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) { + out.copy_(self); + return out; + } + auto iter = ceil_meta(self, out); + native::xpu::ceil_kernel(iter); + return out; +} + } // namespace at diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3c1d5b5d1..60af8f15e 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -163,7 +163,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "adaptive_max_pool2d.out", "adaptive_max_pool3d_backward.grad_input", "adaptive_max_pool3d.out", - "addcdiv.out", "aminmax.out", "angle", "argmin.out", @@ -178,15 +177,12 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "binary_cross_entropy_backward", "bitwise_left_shift.Tensor_out", "bitwise_right_shift.Tensor_out", - "bucketize.Tensor", "cauchy_", "_cdist_backward", - "ceil.out", "channel_shuffle", "cholesky", "cholesky_inverse", "_cholesky_solve_helper", - "conj_physical.out", "copysign.out", "cosh.out", "count_nonzero.dim_IntList", @@ -205,7 +201,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "exp2.out", "expm1.out", "exponential_", - "eye.m_out", "_fft_c2c", "_fft_c2r", "_fft_r2c", @@ -322,7 +317,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "__rshift__.Scalar", "_scaled_dot_product_efficient_attention", "_scaled_mm", - "searchsorted.Tensor", "segment_reduce", "_segment_reduce_backward", "signbit.out", diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.cpp b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp new file mode 100644 index 000000000..213283de0 --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.cpp @@ -0,0 +1,246 @@ +#include +#include +#include + +namespace at::native::xpu { + +// customized lower_bound func to ensure the low bound of 'nan', 'inf' etc. be +// the end of boundary and we can properly handle a sorter argument +// std::lower_bound can not be used here since its customized comparator need +// strict weak ordering and the customized comparators require both arguments to +// have the same type, which wouldn't happen when comparing val of input_t to an +// indexer value from sorter of int64 +template +int64_t cus_lower_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? 
bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val >= val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +// customized upper_bound func to ensure we can properly handle a sorter +// argument std::upper_bound can not be used here since its customized +// comparator requires both arguments to have the same type, which wouldn't +// happen when comparing val of input_t to an indexer value from sorter of int64 +template +int64_t cus_upper_bound( + int64_t start, + int64_t end, + const input_t val, + const input_t* bd, + const int64_t* sort) { + // sorter gives relative ordering for ND tensors, so we need to save and add + // the non-updated start as an offset i.e. the second row of a 3x3 tensors + // starts at element 3 but sorter's second row only contains 0, 1, or 2 + const int64_t orig_start = start; + while (start < end) { + const int64_t mid = start + ((end - start) >> 1); + const input_t mid_val = sort ? bd[sort[mid] + orig_start] : bd[mid]; + if (!(mid_val > val)) { + start = mid + 1; + } else { + end = mid; + } + } + return start; +} + +template +struct SearchsortedKernelFunctor { + void operator()(sycl::nd_item<1> item) const { + for (int64_t i = item.get_global_id(0); i < numel_in_; + i += item.get_global_range()[0]) { + // If boundaries tensor is 1d, we always search the entire boundary + // tensor + int64_t start_bd = is_1d_boundaries_ ? 0 : i / idim_in_ * idim_bd_; + int64_t end_bd = start_bd + idim_bd_; + + int64_t pos = !right_ + ? cus_lower_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd + : cus_upper_bound( + start_bd, end_bd, data_in_data_[i], data_bd_data_, data_st_) - + start_bd; + + // type conversion might happen here + data_out_data_[i] = pos; + } + } + + SearchsortedKernelFunctor( + const bool right, + int64_t numel_in, + int64_t idim_in, + int64_t idim_bd, + const int64_t* data_st, + output_t* data_out, + bool is_1d_boundaries, + input_t* data_in_data, + input_t* data_bd_data, + output_t* data_out_data) + : right_(right), + numel_in_(numel_in), + idim_in_(idim_in), + idim_bd_(idim_bd), + data_st_(data_st), + data_out_(data_out), + is_1d_boundaries_(is_1d_boundaries), + data_in_data_(data_in_data), + data_bd_data_(data_bd_data), + data_out_data_(data_out_data) {} + + private: + const bool right_; + int64_t numel_in_; + int64_t idim_in_; + int64_t idim_bd_; + const int64_t* data_st_; + output_t* data_out_; + bool is_1d_boundaries_; + input_t* data_in_data_; + input_t* data_bd_data_; + output_t* data_out_data_; +}; +template +void searchsorted_template( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + const bool& right, + const Tensor& sorter) { + int64_t numel_in = input.numel(); + int64_t rng, grng, tile_size; + tile_size = syclMaxWorkGroupSize(); + rng = numel_in; + if (rng == 0) { + rng = static_cast(1); + } + + grng = rng; + if (tile_size > grng) { + tile_size = grng; + } else if (grng > tile_size) { + int64_t xMode = static_cast(grng % tile_size); + if (xMode != 0) { + grng += static_cast(tile_size - xMode); + } + } + + bool is_scalar_input = input.dim() == 0 && numel_in == 1; + // inner most dim size of input and boundaries + int64_t idim_in = is_scalar_input ? 1 : input.sizes().back(); + int64_t idim_bd = boundaries.sizes().back(); + + const int64_t* data_st = + sorter.defined() ? 
sorter.data_ptr() : nullptr; + output_t* data_out = result.data_ptr(); + + bool is_1d_boundaries = boundaries.dim() == 1; + auto data_in_data = input.data_ptr(); + auto data_bd_data = boundaries.data_ptr(); + auto data_out_data = result.data_ptr(); + SearchsortedKernelFunctor kfn( + right, + numel_in, + idim_in, + idim_bd, + data_st, + data_out, + is_1d_boundaries, + data_in_data, + data_bd_data, + data_out_data); + + sycl_kernel_submit(grng, tile_size, getCurrentSYCLQueue(), kfn); +} + +void searchsorted_dispatch( + Tensor& result, + const Tensor& input, + const Tensor& boundaries, + bool out_int32, + bool right, + const Tensor& sorter) { + if (!out_int32) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "searchsorted_xpu", + [&] { + searchsorted_template( + result, input, boundaries, right, sorter); + }); + } +} + +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter) { + // for non-contiguous result tensors, we write the output to a contiguous copy + // so we can later copy back, maintaining the original result tensor + Tensor out = result; + if (!result.is_contiguous()) { + out = result.contiguous(); + } + if (sorted_sequence.is_contiguous() && input.is_contiguous() && + sorted_sequence.dtype() == input.dtype() && sorter.is_contiguous()) { + searchsorted_dispatch( + out, input, sorted_sequence, out_int32, right, sorter); + } else { + Tensor trimmed_input; + Tensor trimmed_boundaries; + Tensor trimmed_sorter; + at::native::searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + input, + sorted_sequence, + sorter); + const Tensor& final_input = trimmed_input.defined() ? trimmed_input : input; + const Tensor& final_boundaries = + trimmed_boundaries.defined() ? trimmed_boundaries : sorted_sequence; + const Tensor& final_sorter = + trimmed_sorter.defined() ? 
trimmed_sorter : sorter; + searchsorted_dispatch( + out, final_input, final_boundaries, out_int32, right, final_sorter); + } + + // if result is non-contiguous, we wrote the answer to a copied version, so we + // copy back to the original result tensor + if (!result.is_contiguous()) { + result.copy_(out); + } +} +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/BucketizationKernels.h b/src/ATen/native/xpu/sycl/BucketizationKernels.h new file mode 100644 index 000000000..f47cea2af --- /dev/null +++ b/src/ATen/native/xpu/sycl/BucketizationKernels.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at::native::xpu { +void searchsorted_kernel( + Tensor& result, + const Tensor& input, + const Tensor& sorted_sequence, + bool out_int32, + bool right, + const Tensor& sorter); +} // namespace at::native::xpu \ No newline at end of file diff --git a/src/ATen/native/xpu/sycl/ComplexKernels.cpp b/src/ATen/native/xpu/sycl/ComplexKernels.cpp index 686f5e2d3..56b25d0ef 100644 --- a/src/ATen/native/xpu/sycl/ComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/ComplexKernels.cpp @@ -14,10 +14,11 @@ struct ComplexFunctor { }; void complex_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_xpu", [&]() { - ComplexFunctor f; - gpu_kernel(iter, f); - }); + AT_DISPATCH_FLOATING_TYPES_AND( + kHalf, iter.input_dtype(0), "complex_xpu", [&]() { + ComplexFunctor f; + gpu_kernel(iter, f); + }); } } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp index 6e117ce61..8d3128e9e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.cpp @@ -1,7 +1,4 @@ #include -#include -#include - #include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h index fcfdad46e..32f987a2e 100644 --- a/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h +++ b/src/ATen/native/xpu/sycl/LinearAlgebraKernels.h @@ -1,6 +1,5 @@ #pragma once #include -#include namespace at::native::xpu { diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp index 7b00d09e3..d38f511d7 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.cpp @@ -1,6 +1,6 @@ #include +#include #include -#include #include #include @@ -8,31 +8,98 @@ namespace at::native::xpu { template -struct AddcmulKernelFunctor { - using opmath_t = at::opmath_type; +struct AddcmulFunctor { + using accscalar_t = at::acc_type; scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { - return static_cast(a) + - alpha_ * static_cast(b) * static_cast(c); + return static_cast(a) + + alpha_ * static_cast(b) * static_cast(c); } - AddcmulKernelFunctor(opmath_t alpha) : alpha_(alpha) {} + AddcmulFunctor(accscalar_t alpha) : alpha_(alpha) {} private: - opmath_t alpha_; + accscalar_t alpha_; +}; + +template +struct AddcmulComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * b * c; + } + + AddcmulComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; }; void addcmul_kernel(TensorIterator& iter, Scalar value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, - at::ScalarType::BFloat16, - iter.dtype(), - "addcmul_xpu", - [&]() { - using opmath_t = at::opmath_type; - 
auto alpha = value.to(); - AddcmulKernelFunctor f(alpha); - gpu_kernel(iter, f); - }); + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_xpu", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, AddcmulComplexFunctor(alpha)); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "addcmul_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, AddcmulFunctor(alpha)); + }); + } +} + +template +struct AddcdivFunctor { + using accscalar_t = at::acc_type; + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / static_cast(c)); + } + + AddcdivFunctor(accscalar_t alpha) : alpha_(alpha) {} + + private: + accscalar_t alpha_; +}; + +template +struct AddcdivComplexFunctor { + scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const { + return a + alpha_ * (b / c); + } + + AddcdivComplexFunctor(scalar_t alpha) : alpha_(alpha) {} + + private: + scalar_t alpha_; +}; + +void addcdiv_kernel(TensorIterator& iter, Scalar value) { + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_xpu", [&]() { + auto alpha = value.to(); + AddcdivComplexFunctor f(alpha); + gpu_kernel(iter, f); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "addcdiv_xpu", + [&]() { + using accscalar_t = at::acc_type; + auto alpha = value.to(); + AddcdivFunctor f(alpha); + gpu_kernel(iter, f); + }); + } } template diff --git a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h index fdb216dbd..c775b88e5 100644 --- a/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h +++ b/src/ATen/native/xpu/sycl/PointwiseOpsKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void addcmul_kernel(TensorIterator& iter, Scalar value); +void addcdiv_kernel(TensorIterator& iter, Scalar value); + void mse_backward_kernel(TensorIterator& iter, const Scalar& value); } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp index e082096c1..87de57a3a 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.cpp @@ -30,6 +30,32 @@ void conj_kernel(TensorIterator& iter) { })); } +template +struct ConjPhysicalFunctor { + scalar_t operator()(scalar_t z) const { + return std::conj(z); + } +}; + +template +struct ConjPhysicalFunctor> { + c10::complex operator()(c10::complex z) const { + return c10::complex(z.real(), -z.imag()); + } +}; + +void conj_physical_kernel(TensorIterator& iter) { + AT_DISPATCH_SWITCH( + iter.common_dtype(), + "conj_xpu", + AT_DISPATCH_CASE_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, [&] { + // Conj is a no-op for non-complex types + copy_kernel(iter); + }) AT_DISPATCH_CASE_COMPLEX_TYPES_AND(kComplexHalf, [&] { + gpu_kernel(iter, ConjPhysicalFunctor()); + })); +} + template struct NegConjScalarFunc { scalar_t operator()(scalar_t src_val) const { diff --git a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h index 8d19381b3..d3ad4fe15 100644 --- a/src/ATen/native/xpu/sycl/UnaryComplexKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryComplexKernels.h @@ -6,6 +6,8 @@ namespace at::native::xpu { void conj_kernel(TensorIterator& iter); +void conj_physical_kernel(TensorIterator& iter); + 
void neg_conj_kernel(TensorIterator& iter); void neg_kernel(TensorIterator& iter); diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp index b33be1a30..82bdc4c28 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.cpp @@ -55,4 +55,25 @@ void reciprocal_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, ReciprocalFunctor()); }); } +template +struct CeilFunctor { + scalar_t operator()(const scalar_t a) const { + return std::ceil(a); + } +}; + +template +struct CeilFunctor> { + c10::complex operator()(const c10::complex a) const { + return c10::complex(std::ceil(a.real()), std::ceil(a.imag())); + } +}; + +void ceil_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "ceil_xpu", [&]() { + gpu_kernel(iter, CeilFunctor()); + }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h index 5de711c78..7ab9baf32 100644 --- a/src/ATen/native/xpu/sycl/UnaryFractionKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryFractionKernels.h @@ -6,4 +6,6 @@ namespace at::native::xpu { void reciprocal_kernel(TensorIteratorBase& iter); +void ceil_kernel(TensorIteratorBase& iter); + } // namespace at::native::xpu diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index f4bf64aad..2404fdb9b 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -207,7 +207,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_torch_fallback__refs_square_xpu_bool", "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", - "test_variant_consistency_eager_conj_physical_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", @@ -242,8 +241,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex128", "test_python_ref_torch_fallback__refs_square_xpu_complex64", - "test_conj_view_conj_physical_xpu_complex64", - "test_neg_conj_view_conj_physical_xpu_complex128", # Skip list of new added when porting XPU operators. # See: https://github.com/intel/torch-xpu-ops/issues/128 @@ -2215,9 +2212,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): # torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64", "test_forward_mode_AD_nn_functional_rrelu_xpu_float64", - # RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_forward_mode_AD_conj_physical_xpu_complex128", - # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. +# NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. 
"test_fn_fwgrad_bwgrad_to_sparse_xpu_float64", "test_forward_mode_AD_to_sparse_xpu_float64", ) @@ -2753,9 +2748,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): ### Error #7 in TestBwdGradientsXPU , totally 2 , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [XPU, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher]. "test_fn_grad_to_sparse_xpu_float64", "test_fn_gradgrad_to_sparse_xpu_float64", - ### Error #8 in TestBwdGradientsXPU , totally 2 , RuntimeError: DispatchStub: unsupported device typexpu - "test_inplace_grad_conj_physical_xpu_complex128", - "test_inplace_gradgrad_conj_physical_xpu_complex128", ) res += launch_test("test_ops_gradients_xpu.py", skip_list) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index befd27df1..1228bd607 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -19,6 +19,7 @@ _xpu_computation_op_list = [ "empty", + "eye", "fill", "zeros", "zeros_like", @@ -43,6 +44,7 @@ "bitwise_or", "bitwise_xor", "addcmul", + "addcdiv", "clamp", "clamp_max", "clamp_min", @@ -140,6 +142,8 @@ "sigmoid", "sgn", "nn.functional.embedding_bag", + "bucketize", + "searchsorted", "grid_sampler_2d", # "nn.functional.grid_sample", # Lack of XPU implementation of aten::grid_sampler_3d. "acos", @@ -154,6 +158,7 @@ "bincount", "renorm", "lerp", + "conj_physical", ] diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 13509c591..f88f3108c 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -196,6 +196,8 @@ supported: - exp_ - empty.memory_format - empty_strided + - eye.out + - eye.m_out - _efficientzerotensor - complex.out - clone @@ -459,6 +461,14 @@ supported: - _cdist_forward - _pin_memory - is_pinned + - is_set_to + - bucketize.Tensor + - bucketize.Tensor_out + - bucketize.Scalar + - searchsorted.Tensor + - searchsorted.Tensor_out + - searchsorted.Scalar + - searchsorted.Scalar_out - trace - reflection_pad2d - reflection_pad2d.out @@ -493,9 +503,17 @@ supported: - avg_pool2d.out - avg_pool2d_backward - avg_pool2d_backward.grad_input + - addcdiv.out + - addcdiv + - addcdiv_ - addcmul.out - addcmul - addcmul_ - randperm.generator_out - _amp_foreach_non_finite_check_and_unscale_ - _amp_update_scale_ + - conj_physical.out + - conj_physical_ + - ceil + - ceil_ + - ceil.out