diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh
index ab7d7812d..56d8e3930 100644
--- a/.github/scripts/env.sh
+++ b/.github/scripts/env.sh
@@ -1,3 +1,4 @@
 #!/bin/bash
-source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
+source /opt/intel/oneapi/compiler/latest/env/vars.sh
+source /opt/intel/oneapi/umf/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 45fdef513..fd3e25bb0 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -24,6 +24,7 @@ jobs:
     if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
     uses: ./.github/workflows/_linux_ut.yml
     with:
+      pytorch: release/2.5
       ut: op_example,op_extended,op_ut
       runner: linux.idc.xpu
@@ -32,6 +33,7 @@ jobs:
     if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
     uses: ./.github/workflows/_linux_ut.yml
     with:
+      pytorch: release/2.5
       abi: 0
       ut: op_extended
       runner: linux.idc.xpu
@@ -57,7 +59,7 @@ jobs:
         pwd
         cd ../ && rm -rf pytorch
         source activate e2e_ci
-        git clone -b main https://github.com/pytorch/pytorch pytorch
+        git clone -b release/2.5 https://github.com/pytorch/pytorch pytorch
         cd pytorch
         # apply PRs for stock pytorch
         pip install requests
diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
index f85598b07..c5d3e5d46 100644
--- a/cmake/BuildFlags.cmake
+++ b/cmake/BuildFlags.cmake
@@ -40,6 +40,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   list(APPEND SYCL_HOST_FLAGS -Wno-deprecated)
   list(APPEND SYCL_HOST_FLAGS -Wno-attributes)
   list(APPEND SYCL_HOST_FLAGS -Wno-sign-compare)
+  list(APPEND SYCL_HOST_FLAGS -Wno-error=comment)
+  list(APPEND SYCL_HOST_FLAGS -Wno-error=terminate)
 endif()

 if(CMAKE_BUILD_TYPE MATCHES Debug)
@@ -47,6 +49,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
     list(APPEND SYCL_HOST_FLAGS -O0)
   endif(CMAKE_BUILD_TYPE MATCHES Debug)

+  list(APPEND SYCL_HOST_FLAGS -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})
   # -- Kernel flags (SYCL_KERNEL_OPTIONS)
   # The fast-math will be enabled by default in SYCL compiler.
   # Refer to [https://clang.llvm.org/docs/UsersManual.html#cmdoption-fno-fast-math]
@@ -81,10 +84,13 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -fno-approx-func)
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-absolute-value)
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -no-ftz)
+  set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-error=comment)
+  set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -Wno-error=terminate)
   # Equivalent to build option -fpreview-breaking-changes for SYCL compiler.
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_PREVIEW_BREAKING_CHANGES)
   set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI})
 endif()
+set(SYCL_KERNEL_OPTIONS ${SYCL_KERNEL_OPTIONS} -D__INTEL_LLVM_COMPILER_VERSION=${__INTEL_LLVM_COMPILER})

 CHECK_SYCL_FLAG("-fsycl-fp64-conv-emu" SUPPORTS_FP64_CONV_EMU)
 if(SUPPORTS_FP64_CONV_EMU)
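// Context for the BuildFlags.cmake hunks above: besides demoting the
// "comment" and "terminate" diagnostics from errors back to warnings, CMake
// now forwards the detected Intel LLVM compiler version into every host and
// kernel translation unit as __INTEL_LLVM_COMPILER_VERSION. A minimal
// hypothetical sketch of how source code could branch on that define; the
// 20240100 threshold is an illustrative value, not taken from this patch.
#if defined(__INTEL_LLVM_COMPILER_VERSION) && \
    __INTEL_LLVM_COMPILER_VERSION >= 20240100
// Newer toolchain: use behavior available from this compiler version on.
#else
// Older toolchain: keep the legacy code path.
#endif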
diff --git a/cmake/Modules/FindSYCLToolkit.cmake b/cmake/Modules/FindSYCLToolkit.cmake
index 46e34c7f8..88e5768c4 100644
--- a/cmake/Modules/FindSYCLToolkit.cmake
+++ b/cmake/Modules/FindSYCLToolkit.cmake
@@ -77,7 +77,7 @@ endif()

 # Function to write a test case to verify SYCL features.

-function(SYCL_CMPLR_TEST_WRITE src)
+function(SYCL_CMPLR_TEST_WRITE src macro_name)

   set(cpp_macro_if "#if")
   set(cpp_macro_endif "#endif")
@@ -88,8 +88,8 @@ function(SYCL_CMPLR_TEST_WRITE src)

   # Feature tests goes here

-  string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(SYCL_LANGUAGE_VERSION)\n")
-  string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"SYCL_LANGUAGE_VERSION=\"<<SYCL_LANGUAGE_VERSION<<endl;\n")
+  string(APPEND SYCL_CMPLR_TEST_CONTENT "${cpp_macro_if} defined(${macro_name})\n")
+  string(APPEND SYCL_CMPLR_TEST_CONTENT "cout << \"${macro_name}=\"<<${macro_name}<<endl;\n")
diff --git a/src/comm/SYCLHelpers.h b/src/comm/SYCLHelpers.h
--- a/src/comm/SYCLHelpers.h
+++ b/src/comm/SYCLHelpers.h
@@ ... @@ sycl_kernel_submit(
     ::sycl::range<dim> range,
     ::sycl::queue q,
     ker_t ker) {
+#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
   auto cgf = [&](::sycl::handler& cgh) {
     ::sycl::ext::oneapi::experimental::parallel_for(cgh, range, ker);
   };
   ::sycl::ext::oneapi::experimental::submit(q, cgf);
+#else
+  auto cgf = [&](::sycl::handler& cgh) { cgh.parallel_for(range, ker); };
+  q.submit(cgf);
+#endif
 }

 // Additional convention of SYCL kernel configuration. Besides construct kernel
@@ -80,12 +85,21 @@ sycl_kernel_submit(
     ::sycl::range<dim> local_range,
     ::sycl::queue q,
     ker_t ker) {
+#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
   auto cgf = [&](::sycl::handler& cgh) {
     ker.sycl_ker_config_convention(cgh);
     ::sycl::ext::oneapi::experimental::nd_launch(
         cgh, ::sycl::nd_range<dim>(global_range, local_range), ker);
   };
   ::sycl::ext::oneapi::experimental::submit(q, cgf);
+#else
+  auto cgf = [&](::sycl::handler& cgh) {
+    ker.sycl_ker_config_convention(cgh);
+    cgh.parallel_for(
+        ::sycl::nd_range<dim>(global_range, local_range), ker);
+  };
+  q.submit(cgf);
+#endif
 }

 template <typename ker_t, int dim>
@@ -97,11 +111,19 @@ sycl_kernel_submit(
     ::sycl::range<dim> local_range,
     ::sycl::queue q,
     ker_t ker) {
+#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
   auto cgf = [&](::sycl::handler& cgh) {
     ::sycl::ext::oneapi::experimental::nd_launch(
         cgh, ::sycl::nd_range<dim>(global_range, local_range), ker);
   };
   ::sycl::ext::oneapi::experimental::submit(q, cgf);
+#else
+  auto cgf = [&](::sycl::handler& cgh) {
+    cgh.parallel_for(
+        ::sycl::nd_range<dim>(global_range, local_range), ker);
+  };
+  q.submit(cgf);
+#endif
 }

 template <typename ker_t>
@@ -113,6 +135,7 @@ sycl_kernel_submit(
     int64_t local_range,
     ::sycl::queue q,
     ker_t ker) {
+#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
   auto cgf = [&](::sycl::handler& cgh) {
     ker.sycl_ker_config_convention(cgh);
     ::sycl::ext::oneapi::experimental::nd_launch(
@@ -122,6 +145,16 @@ sycl_kernel_submit(
         ker);
   };
   ::sycl::ext::oneapi::experimental::submit(q, cgf);
+#else
+  auto cgf = [&](::sycl::handler& cgh) {
+    ker.sycl_ker_config_convention(cgh);
+    cgh.parallel_for(
+        ::sycl::nd_range<1>(
+            ::sycl::range<1>(global_range), ::sycl::range<1>(local_range)),
+        ker);
+  };
+  q.submit(cgf);
+#endif
 }

 template <typename ker_t>
@@ -133,6 +166,7 @@ sycl_kernel_submit(
     int64_t local_range,
     ::sycl::queue q,
     ker_t ker) {
+#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
   auto cgf = [&](::sycl::handler& cgh) {
     ::sycl::ext::oneapi::experimental::nd_launch(
         cgh,
@@ -141,4 +175,13 @@ sycl_kernel_submit(
         ker);
   };
   ::sycl::ext::oneapi::experimental::submit(q, cgf);
+#else
+  auto cgf = [&](::sycl::handler& cgh) {
+    cgh.parallel_for(
+        ::sycl::nd_range<1>(
+            ::sycl::range<1>(global_range), ::sycl::range<1>(local_range)),
+        ker);
+  };
+  q.submit(cgf);
+#endif
 }
diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index 01649588f..eba71b17f 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -148,6 +148,9 @@
     "test_compare_cpu_nanmedian_xpu_int64",
     "test_compare_cpu_nanmedian_xpu_int8",
     "test_compare_cpu_nanmedian_xpu_uint8",
+    "test_compare_cpu_nn_functional_unfold_xpu_bool",
+    "test_non_standard_bool_values_nn_functional_unfold_xpu_bool",
+    "test_non_standard_bool_values_index_put_xpu_bool",
 )
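// The sycl_kernel_submit hunks above all apply one pattern: when the
// toolchain advertises the enqueue-functions extension
// (SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1), submit work through the
// experimental free functions; otherwise fall back to the standard SYCL 2020
// queue::submit() / handler::parallel_for() path. Below is a self-contained
// sketch of the same dispatch; submit_kernel and the fill kernel are
// hypothetical names, not part of the patch.
#include <sycl/sycl.hpp>

template <typename ker_t, int dim>
void submit_kernel(::sycl::range<dim> range, ::sycl::queue q, ker_t ker) {
#if SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS == 1
  // Newer oneAPI toolchains: free-function submission from the extension.
  auto cgf = [&](::sycl::handler& cgh) {
    ::sycl::ext::oneapi::experimental::parallel_for(cgh, range, ker);
  };
  ::sycl::ext::oneapi::experimental::submit(q, cgf);
#else
  // Portable fallback: the identical kernel via the classic submission API.
  q.submit([&](::sycl::handler& cgh) { cgh.parallel_for(range, ker); });
#endif
}

int main() {
  ::sycl::queue q;
  int* data = ::sycl::malloc_shared<int>(64, q);
  // The same kernel functor runs on either submission path.
  submit_kernel(::sycl::range<1>(64), q, [=](::sycl::item<1> it) {
    data[it.get_linear_id()] = static_cast<int>(it.get_linear_id());
  });
  q.wait();
  ::sycl::free(data, q);
  return 0;
}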
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 0d0f18a86..2b9235efc 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -1,5 +1,7 @@
 skip_dict = {
     "test_ops_xpu.py": (
+        "test_noncontiguous_samples_histogram_xpu_float32",
+        "test_non_standard_bool_values_index_put_xpu_bool",
         # Skip list of base line
         "test_dtypes___rmod___xpu",
         "test_dtypes_nn_functional_conv1d_xpu",
@@ -1250,6 +1252,12 @@
     ),
     "test_unary_ufuncs_xpu.py": (
+        "test_reference_numerics_extremal__refs_exp2_xpu_complex64",
+        "test_exp_xpu_complex64",
+        "test_reference_numerics_extremal__refs_exp_xpu_complex64",
+        "test_reference_numerics_extremal_exp2_xpu_complex64",
+        "test_reference_numerics_extremal_exp_xpu_complex64",
+        "test_reference_numerics_large_exp_xpu_complex32",
         # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available.
         "_jiterator_",
         # CPU Fallback fails: Tensor-likes are not close!
@@ -2153,314 +2161,6 @@
         "test_to and not test_to_memory and not test_total",
     ),
-    "test_ops_gradients_xpu.py": (
-        ### Error #0 in TestBwdGradientsXPU , totally 271 , RuntimeError: Double and complex datatype matmul is not supported in oneDNN
-        "test_fn_grad___rmatmul___xpu_complex128",
-        "test_fn_grad___rmatmul___xpu_float64",
-        "test_fn_grad_addbmm_xpu_float64",
-        "test_fn_grad_addmm_decomposed_xpu_complex128",
-        "test_fn_grad_addmm_decomposed_xpu_float64",
-        "test_fn_grad_addmm_xpu_complex128",
-        "test_fn_grad_addmm_xpu_float64",
-        "test_fn_grad_addmv_xpu_complex128",
-        "test_fn_grad_addmv_xpu_float64",
-        "test_fn_grad_addr_xpu_complex128",
-        "test_fn_grad_addr_xpu_float64",
-        "test_fn_grad_baddbmm_xpu_complex128",
-        "test_fn_grad_baddbmm_xpu_float64",
-        "test_fn_grad_bmm_xpu_complex128",
-        "test_fn_grad_bmm_xpu_float64",
-        "test_fn_grad_cdist_xpu_float64",
-        "test_fn_grad_cholesky_inverse_xpu_complex128",
-        "test_fn_grad_cholesky_inverse_xpu_float64",
-        "test_fn_grad_cholesky_solve_xpu_complex128",
-        "test_fn_grad_cholesky_solve_xpu_float64",
-        "test_fn_grad_cholesky_xpu_complex128",
-        "test_fn_grad_cholesky_xpu_float64",
-        "test_fn_grad_corrcoef_xpu_complex128",
-        "test_fn_grad_corrcoef_xpu_float64",
-        "test_fn_grad_einsum_xpu_complex128",
-        "test_fn_grad_einsum_xpu_float64",
-        "test_fn_grad_inner_xpu_complex128",
-        "test_fn_grad_inner_xpu_float64",
-        "test_fn_grad_linalg_cholesky_ex_xpu_complex128",
-        "test_fn_grad_linalg_cholesky_ex_xpu_float64",
-        "test_fn_grad_linalg_cholesky_xpu_complex128",
-        "test_fn_grad_linalg_cholesky_xpu_float64",
-        "test_fn_grad_linalg_cond_xpu_complex128",
-        "test_fn_grad_linalg_cond_xpu_float64",
-        "test_fn_grad_linalg_det_singular_xpu_complex128",
-        "test_fn_grad_linalg_det_singular_xpu_float64",
-        "test_fn_grad_linalg_det_xpu_complex128",
-        "test_fn_grad_linalg_det_xpu_float64",
-        "test_fn_grad_linalg_eig_xpu_complex128",
-        "test_fn_grad_linalg_eig_xpu_float64",
-        "test_fn_grad_linalg_eigh_xpu_complex128",
-        "test_fn_grad_linalg_eigh_xpu_float64",
-        "test_fn_grad_linalg_eigvals_xpu_complex128",
-        "test_fn_grad_linalg_eigvals_xpu_float64",
-        "test_fn_grad_linalg_eigvalsh_xpu_complex128",
-        "test_fn_grad_linalg_eigvalsh_xpu_float64",
-        "test_fn_grad_linalg_householder_product_xpu_complex128",
-        "test_fn_grad_linalg_householder_product_xpu_float64",
-        "test_fn_grad_linalg_inv_ex_xpu_complex128",
-        "test_fn_grad_linalg_inv_ex_xpu_float64",
-        "test_fn_grad_linalg_inv_xpu_complex128",
-        "test_fn_grad_linalg_inv_xpu_float64",
-        "test_fn_grad_linalg_lstsq_grad_oriented_xpu_complex128",
"test_fn_grad_linalg_lstsq_grad_oriented_xpu_float64", - "test_fn_grad_linalg_lu_factor_ex_xpu_complex128", - "test_fn_grad_linalg_lu_factor_ex_xpu_float64", - "test_fn_grad_linalg_lu_factor_xpu_complex128", - "test_fn_grad_linalg_lu_factor_xpu_float64", - "test_fn_grad_linalg_lu_solve_xpu_complex128", - "test_fn_grad_linalg_lu_solve_xpu_float64", - "test_fn_grad_linalg_lu_xpu_complex128", - "test_fn_grad_linalg_lu_xpu_float64", - "test_fn_grad_linalg_matrix_norm_xpu_complex128", - "test_fn_grad_linalg_matrix_norm_xpu_float64", - "test_fn_grad_linalg_matrix_power_xpu_complex128", - "test_fn_grad_linalg_matrix_power_xpu_float64", - "test_fn_grad_linalg_multi_dot_xpu_complex128", - "test_fn_grad_linalg_multi_dot_xpu_float64", - "test_fn_grad_linalg_norm_xpu_float64", - "test_fn_grad_linalg_pinv_hermitian_xpu_complex128", - "test_fn_grad_linalg_pinv_hermitian_xpu_float64", - "test_fn_grad_linalg_pinv_singular_xpu_complex128", - "test_fn_grad_linalg_pinv_singular_xpu_float64", - "test_fn_grad_linalg_pinv_xpu_complex128", - "test_fn_grad_linalg_pinv_xpu_float64", - "test_fn_grad_linalg_qr_xpu_complex128", - "test_fn_grad_linalg_qr_xpu_float64", - "test_fn_grad_linalg_slogdet_xpu_complex128", - "test_fn_grad_linalg_slogdet_xpu_float64", - "test_fn_grad_linalg_solve_ex_xpu_complex128", - "test_fn_grad_linalg_solve_ex_xpu_float64", - "test_fn_grad_linalg_solve_triangular_xpu_complex128", - "test_fn_grad_linalg_solve_triangular_xpu_float64", - "test_fn_grad_linalg_solve_xpu_complex128", - "test_fn_grad_linalg_solve_xpu_float64", - "test_fn_grad_linalg_svd_xpu_complex128", - "test_fn_grad_linalg_svd_xpu_float64", - "test_fn_grad_linalg_svdvals_xpu_complex128", - "test_fn_grad_linalg_svdvals_xpu_float64", - "test_fn_grad_linalg_tensorinv_xpu_complex128", - "test_fn_grad_linalg_tensorinv_xpu_float64", - "test_fn_grad_linalg_tensorsolve_xpu_complex128", - "test_fn_grad_linalg_tensorsolve_xpu_float64", - "test_fn_grad_logdet_xpu_complex128", - "test_fn_grad_logdet_xpu_float64", - "test_fn_grad_lu_solve_xpu_complex128", - "test_fn_grad_lu_solve_xpu_float64", - "test_fn_grad_lu_xpu_complex128", - "test_fn_grad_lu_xpu_float64", - "test_fn_grad_matmul_xpu_complex128", - "test_fn_grad_matmul_xpu_float64", - "test_fn_grad_mm_xpu_complex128", - "test_fn_grad_mm_xpu_float64", - "test_fn_grad_mv_xpu_complex128", - "test_fn_grad_mv_xpu_float64", - "test_fn_grad_nn_functional_bilinear_xpu_float64", - "test_fn_grad_nn_functional_linear_xpu_complex128", - "test_fn_grad_nn_functional_linear_xpu_float64", - "test_fn_grad_nn_functional_multi_head_attention_forward_xpu_float64", - "test_fn_grad_nn_functional_scaled_dot_product_attention_xpu_float64", - "test_fn_grad_norm_nuc_xpu_complex128", - "test_fn_grad_norm_nuc_xpu_float64", - "test_fn_grad_ormqr_xpu_complex128", - "test_fn_grad_ormqr_xpu_float64", - "test_fn_grad_pca_lowrank_xpu_float64", - "test_fn_grad_pinverse_xpu_complex128", - "test_fn_grad_pinverse_xpu_float64", - "test_fn_grad_qr_xpu_complex128", - "test_fn_grad_qr_xpu_float64", - "test_fn_grad_svd_lowrank_xpu_float64", - "test_fn_grad_svd_xpu_complex128", - "test_fn_grad_svd_xpu_float64", - "test_fn_grad_tensordot_xpu_complex128", - "test_fn_grad_tensordot_xpu_float64", - "test_fn_grad_triangular_solve_xpu_complex128", - "test_fn_grad_triangular_solve_xpu_float64", - "test_fn_gradgrad___rmatmul___xpu_complex128", - "test_fn_gradgrad___rmatmul___xpu_float64", - "test_fn_gradgrad_addbmm_xpu_float64", - "test_fn_gradgrad_addmm_decomposed_xpu_complex128", - "test_fn_gradgrad_addmm_decomposed_xpu_float64", - 
"test_fn_gradgrad_addmm_xpu_complex128", - "test_fn_gradgrad_addmm_xpu_float64", - "test_fn_gradgrad_addmv_xpu_complex128", - "test_fn_gradgrad_addmv_xpu_float64", - "test_fn_gradgrad_addr_xpu_complex128", - "test_fn_gradgrad_addr_xpu_float64", - "test_fn_gradgrad_baddbmm_xpu_complex128", - "test_fn_gradgrad_baddbmm_xpu_float64", - "test_fn_gradgrad_bmm_xpu_complex128", - "test_fn_gradgrad_bmm_xpu_float64", - "test_fn_gradgrad_cholesky_inverse_xpu_complex128", - "test_fn_gradgrad_cholesky_inverse_xpu_float64", - "test_fn_gradgrad_cholesky_solve_xpu_complex128", - "test_fn_gradgrad_cholesky_solve_xpu_float64", - "test_fn_gradgrad_cholesky_xpu_complex128", - "test_fn_gradgrad_cholesky_xpu_float64", - "test_fn_gradgrad_corrcoef_xpu_complex128", - "test_fn_gradgrad_corrcoef_xpu_float64", - "test_fn_gradgrad_einsum_xpu_complex128", - "test_fn_gradgrad_einsum_xpu_float64", - "test_fn_gradgrad_inner_xpu_complex128", - "test_fn_gradgrad_inner_xpu_float64", - "test_fn_gradgrad_linalg_cholesky_ex_xpu_complex128", - "test_fn_gradgrad_linalg_cholesky_ex_xpu_float64", - "test_fn_gradgrad_linalg_cholesky_xpu_complex128", - "test_fn_gradgrad_linalg_cholesky_xpu_float64", - "test_fn_gradgrad_linalg_cond_xpu_complex128", - "test_fn_gradgrad_linalg_cond_xpu_float64", - "test_fn_gradgrad_linalg_det_xpu_complex128", - "test_fn_gradgrad_linalg_det_xpu_float64", - "test_fn_gradgrad_linalg_eig_xpu_complex128", - "test_fn_gradgrad_linalg_eig_xpu_float64", - "test_fn_gradgrad_linalg_eigh_xpu_complex128", - "test_fn_gradgrad_linalg_eigh_xpu_float64", - "test_fn_gradgrad_linalg_eigvals_xpu_complex128", - "test_fn_gradgrad_linalg_eigvals_xpu_float64", - "test_fn_gradgrad_linalg_eigvalsh_xpu_complex128", - "test_fn_gradgrad_linalg_eigvalsh_xpu_float64", - "test_fn_gradgrad_linalg_householder_product_xpu_complex128", - "test_fn_gradgrad_linalg_householder_product_xpu_float64", - "test_fn_gradgrad_linalg_inv_ex_xpu_complex128", - "test_fn_gradgrad_linalg_inv_ex_xpu_float64", - "test_fn_gradgrad_linalg_inv_xpu_complex128", - "test_fn_gradgrad_linalg_inv_xpu_float64", - "test_fn_gradgrad_linalg_lstsq_grad_oriented_xpu_complex128", - "test_fn_gradgrad_linalg_lstsq_grad_oriented_xpu_float64", - "test_fn_gradgrad_linalg_lu_factor_ex_xpu_complex128", - "test_fn_gradgrad_linalg_lu_factor_ex_xpu_float64", - "test_fn_gradgrad_linalg_lu_factor_xpu_complex128", - "test_fn_gradgrad_linalg_lu_factor_xpu_float64", - "test_fn_gradgrad_linalg_lu_solve_xpu_complex128", - "test_fn_gradgrad_linalg_lu_solve_xpu_float64", - "test_fn_gradgrad_linalg_lu_xpu_complex128", - "test_fn_gradgrad_linalg_lu_xpu_float64", - "test_fn_gradgrad_linalg_matrix_norm_xpu_complex128", - "test_fn_gradgrad_linalg_matrix_norm_xpu_float64", - "test_fn_gradgrad_linalg_matrix_power_xpu_complex128", - "test_fn_gradgrad_linalg_matrix_power_xpu_float64", - "test_fn_gradgrad_linalg_multi_dot_xpu_complex128", - "test_fn_gradgrad_linalg_multi_dot_xpu_float64", - "test_fn_gradgrad_linalg_pinv_hermitian_xpu_complex128", - "test_fn_gradgrad_linalg_pinv_hermitian_xpu_float64", - "test_fn_gradgrad_linalg_pinv_singular_xpu_float64", - "test_fn_gradgrad_linalg_pinv_xpu_complex128", - "test_fn_gradgrad_linalg_pinv_xpu_float64", - "test_fn_gradgrad_linalg_qr_xpu_complex128", - "test_fn_gradgrad_linalg_qr_xpu_float64", - "test_fn_gradgrad_linalg_slogdet_xpu_complex128", - "test_fn_gradgrad_linalg_slogdet_xpu_float64", - "test_fn_gradgrad_linalg_solve_ex_xpu_complex128", - "test_fn_gradgrad_linalg_solve_ex_xpu_float64", - "test_fn_gradgrad_linalg_solve_triangular_xpu_complex128", - 
"test_fn_gradgrad_linalg_solve_triangular_xpu_float64", - "test_fn_gradgrad_linalg_solve_xpu_complex128", - "test_fn_gradgrad_linalg_solve_xpu_float64", - "test_fn_gradgrad_linalg_svd_xpu_complex128", - "test_fn_gradgrad_linalg_svd_xpu_float64", - "test_fn_gradgrad_linalg_svdvals_xpu_complex128", - "test_fn_gradgrad_linalg_svdvals_xpu_float64", - "test_fn_gradgrad_linalg_tensorinv_xpu_complex128", - "test_fn_gradgrad_linalg_tensorinv_xpu_float64", - "test_fn_gradgrad_linalg_tensorsolve_xpu_complex128", - "test_fn_gradgrad_linalg_tensorsolve_xpu_float64", - "test_fn_gradgrad_logdet_xpu_complex128", - "test_fn_gradgrad_logdet_xpu_float64", - "test_fn_gradgrad_lu_solve_xpu_complex128", - "test_fn_gradgrad_lu_solve_xpu_float64", - "test_fn_gradgrad_lu_xpu_complex128", - "test_fn_gradgrad_lu_xpu_float64", - "test_fn_gradgrad_matmul_xpu_complex128", - "test_fn_gradgrad_matmul_xpu_float64", - "test_fn_gradgrad_mm_xpu_complex128", - "test_fn_gradgrad_mm_xpu_float64", - "test_fn_gradgrad_mv_xpu_complex128", - "test_fn_gradgrad_mv_xpu_float64", - "test_fn_gradgrad_nn_functional_bilinear_xpu_float64", - "test_fn_gradgrad_nn_functional_linear_xpu_complex128", - "test_fn_gradgrad_nn_functional_linear_xpu_float64", - "test_fn_gradgrad_nn_functional_multi_head_attention_forward_xpu_float64", - "test_fn_gradgrad_nn_functional_scaled_dot_product_attention_xpu_float64", - "test_fn_gradgrad_norm_nuc_xpu_complex128", - "test_fn_gradgrad_norm_nuc_xpu_float64", - "test_fn_gradgrad_ormqr_xpu_complex128", - "test_fn_gradgrad_ormqr_xpu_float64", - "test_fn_gradgrad_pca_lowrank_xpu_float64", - "test_fn_gradgrad_pinverse_xpu_complex128", - "test_fn_gradgrad_pinverse_xpu_float64", - "test_fn_gradgrad_qr_xpu_complex128", - "test_fn_gradgrad_qr_xpu_float64", - "test_fn_gradgrad_svd_lowrank_xpu_float64", - "test_fn_gradgrad_svd_xpu_complex128", - "test_fn_gradgrad_svd_xpu_float64", - "test_fn_gradgrad_tensordot_xpu_complex128", - "test_fn_gradgrad_tensordot_xpu_float64", - "test_fn_gradgrad_triangular_solve_xpu_complex128", - "test_fn_gradgrad_triangular_solve_xpu_float64", - "test_inplace_grad_addbmm_xpu_float64", - "test_inplace_grad_addmm_decomposed_xpu_complex128", - "test_inplace_grad_addmm_decomposed_xpu_float64", - "test_inplace_grad_addmm_xpu_complex128", - "test_inplace_grad_addmm_xpu_float64", - "test_inplace_grad_addmv_xpu_complex128", - "test_inplace_grad_addmv_xpu_float64", - "test_inplace_grad_addr_xpu_complex128", - "test_inplace_grad_addr_xpu_float64", - "test_inplace_grad_baddbmm_xpu_complex128", - "test_inplace_grad_baddbmm_xpu_float64", - "test_inplace_gradgrad_addbmm_xpu_float64", - "test_inplace_gradgrad_addmm_decomposed_xpu_complex128", - "test_inplace_gradgrad_addmm_decomposed_xpu_float64", - "test_inplace_gradgrad_addmm_xpu_complex128", - "test_inplace_gradgrad_addmm_xpu_float64", - "test_inplace_gradgrad_addmv_xpu_complex128", - "test_inplace_gradgrad_addmv_xpu_float64", - "test_inplace_gradgrad_addr_xpu_complex128", - "test_inplace_gradgrad_addr_xpu_float64", - "test_inplace_gradgrad_baddbmm_xpu_complex128", - "test_inplace_gradgrad_baddbmm_xpu_float64", - "test_fn_grad_pca_lowrank_xpu_complex128", - "test_fn_grad_svd_lowrank_xpu_complex128", - "test_fn_gradgrad_pca_lowrank_xpu_complex128", - "test_fn_gradgrad_svd_lowrank_xpu_complex128", - "test_fn_grad_linalg_norm_xpu_complex128", - ### Error #1 in TestBwdGradientsXPU , totally 4 , RuntimeError: value cannot be converted to type float without overflow - "test_fn_grad_addbmm_xpu_complex128", - "test_fn_gradgrad_addbmm_xpu_complex128", - 
"test_inplace_grad_addbmm_xpu_complex128", - "test_inplace_gradgrad_addbmm_xpu_complex128", - ### rrelu_xpu op is not implemented,try these cases after implementing rrelu. - "test_fn_grad_nn_functional_rrelu_xpu_float64", - "test_fn_gradgrad_nn_functional_rrelu_xpu_float64", - "test_inplace_grad_nn_functional_rrelu_xpu_float64", - "test_inplace_gradgrad_nn_functional_rrelu_xpu_float64", - ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive - "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", - "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", - "test_fn_grad_nn_functional_conv_transpose3d_xpu_complex128", - "test_fn_grad_nn_functional_conv_transpose3d_xpu_float64", - "test_fn_gradgrad_nn_functional_conv_transpose2d_xpu_complex128", - "test_fn_gradgrad_nn_functional_conv_transpose2d_xpu_float64", - "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_complex128", - "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_float64", - "test_fn_gradgrad_index_reduce_mean_xpu_float64", - "test_fn_gradgrad_index_reduce_prod_xpu_float64", - "test_inplace_gradgrad_index_reduce_mean_xpu_float64", - "test_inplace_gradgrad_index_reduce_prod_xpu_float64", - ### Error #7 in TestBwdGradientsXPU , totally 2 , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [XPU, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher]. - "test_fn_grad_to_sparse_xpu_float64", - "test_fn_gradgrad_to_sparse_xpu_float64", - - # issue: https://github.com/intel/torch-xpu-ops/issues/809 - "test_fn_gradgrad_nn_functional_conv3d_xpu_complex128", - "test_fn_gradgrad_nn_functional_conv3d_xpu_float64", - ), - "test_torch_xpu.py": ( # issue 302 ### Error #0 in TestTorchDeviceTypeXPU , totally 11 , RuntimeError: expected scalar type Long but found Int @@ -2627,79 +2327,6 @@ "nn/test_pruning_xpu.py": None, - "test_foreach_xpu.py": ( - # CPU fallback fails. Implementation difference between CPU and CUDA. Expect success on CPU and expect fail on CUDA. When we use CPU fallback and align expected fail list with CUDA, these cases fail. 
-        # Unexpected success
-        "test_parity__foreach_ceil_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_ceil_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_ceil_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_ceil_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_clamp_max_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_clamp_max_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_clamp_max_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_clamp_max_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_clamp_min_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_clamp_min_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_clamp_min_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_clamp_min_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_erf_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_erf_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_erf_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_erf_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_erfc_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_erfc_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_erfc_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_erfc_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_floor_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_floor_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_floor_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_floor_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_frac_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_frac_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_frac_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_frac_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_lgamma_fastpath_inplace_xpu_bfloat16",
-        "test_parity__foreach_lgamma_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_lgamma_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_lgamma_fastpath_outplace_xpu_bfloat16",
-        "test_parity__foreach_lgamma_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_lgamma_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_maximum_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_maximum_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_maximum_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_maximum_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_minimum_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_minimum_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_minimum_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_minimum_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_round_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_round_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_round_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_round_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_sigmoid_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_sigmoid_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_sigmoid_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_sigmoid_fastpath_outplace_xpu_complex64",
-        "test_parity__foreach_sign_fastpath_inplace_xpu_complex128",
-        "test_parity__foreach_sign_fastpath_inplace_xpu_complex64",
-        "test_parity__foreach_sign_fastpath_outplace_xpu_complex128",
-        "test_parity__foreach_sign_fastpath_outplace_xpu_complex64",
"test_parity__foreach_trunc_fastpath_inplace_xpu_complex128", - "test_parity__foreach_trunc_fastpath_inplace_xpu_complex64", - "test_parity__foreach_trunc_fastpath_outplace_xpu_complex128", - "test_parity__foreach_trunc_fastpath_outplace_xpu_complex64", - "test_autodiff__foreach_sigmoid_inplace_xpu_complex128", - "test_autodiff__foreach_sigmoid_outplace_xpu_complex128", - "test_binary_op_with_scalar_self_support__foreach_pow_is_fastpath_True_xpu_bool", - # AssertionError: RuntimeError not raised - "test_0dim_tensor_overload_exception_xpu", - # RuntimeError: Tried to instantiate dummy base class CUDAGraph - "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float32", - "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float64", - "test_big_num_tensors__foreach_norm_use_cuda_graph_True_xpu_float32", - "test_big_num_tensors__foreach_norm_use_cuda_graph_True_xpu_float64", - ), - "nn/test_convolution_xpu.py": ( # XPU unsupport ops, skip. # https://github.com/intel/torch-xpu-ops/issues/348