diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index 97ff00326..425ffc097 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -50,11 +50,9 @@
     "test_compare_cpu_exp2_xpu_complex128",
     "test_compare_cpu_exp2_xpu_complex64",
     "test_compare_cpu_nextafter_xpu_bfloat16",
-
     # skip random failure due to accuracy
     # AssertionError: Tensor-likes are not close!
     "test_compare_cpu_atan2_xpu_bfloat16",
-
     # CUDA does not support the data type either
     "test_compare_cpu_native_dropout_backward_xpu_bool",
     "test_compare_cpu_native_dropout_backward_xpu_int16",
@@ -63,22 +61,17 @@
     "test_compare_cpu_native_dropout_backward_xpu_int8",
     "test_compare_cpu_native_dropout_backward_xpu_uint8",
     "test_non_standard_bool_values_native_dropout_backward_xpu_bool",
-
     # Need FP64 golden ref for more accurate comparison
     "test_compare_cpu_log_softmax_xpu_bfloat16",
-
     # TestCompositeCompliance
     # CPU fallback fails
     # Require implementing aten::embedding_renorm_
     "test_view_replay_nn_functional_embedding_xpu_float32",
-
     # TestCompositeCompliance::test_cow_input
     # XPU Tensor fails in copy-on-write cases
     # AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization.
     # https://github.com/intel/torch-xpu-ops/issues/281
     "test_cow_input",
-
-
     # XPU implementation is correct.
     # std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified)
     # std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified)
@@ -86,36 +79,29 @@
     # https://en.cppreference.com/w/cpp/numeric/complex/exp
     "test_compare_cpu_sigmoid_xpu_complex64",
     "test_compare_cpu_sigmoid_xpu_complex128",
-
     # Align with CUDA dtypes - RuntimeError: "avg_pool2d_out_xpu" not implemented for 'Long'
     "test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",
-
     # Special handle (different calculation order) in CPU reference impl.
     # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300
     "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16",
     "test_compare_cpu_nn_functional_embedding_bag_xpu_float16",
-
     # Not implemented operators, aten::embedding_renorm_.
     # To retrieve cases when the operators are supported.
     # https://github.com/intel/torch-xpu-ops/issues/380
     "test_compare_cpu_nn_functional_embedding_bag_xpu_float32",
     "test_compare_cpu_nn_functional_embedding_bag_xpu_float64",
     "test_view_replay_nn_functional_embedding_bag_xpu_float32",
-
-    #Double and complex datatype matmul is not supported in oneDNN
+    # Double and complex datatype matmul is not supported in oneDNN
     "test_compare_cpu_cdist_xpu_float64",
-
     # CPU reference fail. `abs_cpu` does not support bool.
     # The case should be skipped by PyTorch test infrastructure, but not be
     # skipped correctly after https://github.com/pytorch/pytorch/pull/124147
     # https://github.com/intel/torch-xpu-ops/issues/412
     "test_compare_cpu_abs_xpu_bool",
-
     # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision
     # Not in CUDA test scope too
     "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16",
     "test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16",
-
     # CPU result is not golden reference
     "test_compare_cpu_nn_functional_group_norm_xpu_bfloat16",
     "test_compare_cpu_nn_functional_group_norm_xpu_float16",
@@ -130,25 +116,20 @@
     # Align with CUDA impl by using accumulate type. But CPU doesn't use.
     # When XPU uses original data type, the case passes.
     "test_compare_cpu_logit_xpu_bfloat16",
-
     # precison error
     # Mismatched elements: 1 / 24 (4.2%)
     # Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed)
     # Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed)
     "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16",
-
     # RuntimeError: "compute_index_ranges_weights" not implemented for 'Half'
     "test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16",
-
     # AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False...
     "test_cow_input_nn_functional_interpolate_bilinear_xpu_float32",
     "test_cow_input_nn_functional_interpolate_linear_xpu_float32",
     "test_cow_input_nn_functional_interpolate_trilinear_xpu_float32",
-
     #The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent
     "test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16",
     "test_compare_cpu_nn_functional_interpolate_linear_xpu_float16",
-
     # bicubic interpolate includes large calculation steps, accuracy reduces in half-precision
     # Not in CUDA test scope too
     "test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16",
@@ -157,17 +138,14 @@
     # Retrieve it once the operator is implemented.
     # Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device.
     "test_forward_ad_nn_functional_glu_xpu_float32",
-
     # Precision error.
     # Mismatched elements: 1 / 812 (0.1%)
     # Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed)
     # Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed)
     "test_compare_cpu_hypot_xpu_bfloat16",
-
     # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
     # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
     "test_compare_cpu_polar_xpu_bfloat16",
-
     # Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
     # https://github.com/intel/torch-xpu-ops/issues/549
     # Example fail log
@@ -179,25 +157,21 @@
     "test_compare_cpu_std_mean_xpu_bfloat16",
     "test_compare_cpu_sub_xpu_float16",
     "test_compare_cpu_var_mean_xpu_bfloat16",
-
     # test case doesn't make sense, will file an issue to track it.
     # https://github.com/pytorch/pytorch/issues/130916
     "test_compare_cpu_histogram_xpu_float32",
     "test_compare_cpu_histogram_xpu_float64",
-
     # Precision error.
     # Mismatched elements: 2 / 125 (1.6%)
     # Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed)
     # Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed)
     "test_compare_cpu_cumprod_xpu_bfloat16",
-
     # Precision error.
     # Mismatched elements: 1 / 9 (11.1%)
     # Greatest absolute difference: 0.001953125 at index (2, 2) (up to 0.001 allowed)
     # Greatest relative difference: 0.004669189453125 at index (2, 2) (up to 0.001 allowed)
     # Not in CUDA test scope too
     "test_compare_cpu_prod_xpu_bfloat16 ",
-
     # different results for value index due to unstable sort.
     # XPU and CUDA have the same result.
     "test_compare_cpu_median_xpu_int16",
diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py
index ae19942f3..8921075c3 100644
--- a/test/xpu/run_test_with_skip.py
+++ b/test/xpu/run_test_with_skip.py
@@ -38,6 +38,8 @@ def launch_test(test_case, skip_list=None, exe_list=None):
 res = 0
 
 # test_ops
+
+
 skip_list = (
     # Skip list of base line
     "test_dtypes___rmod___xpu",
@@ -84,8 +86,6 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64",
     "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32",
     "test_out_warning_nanmean_xpu",
-    "test_out_warning_nn_functional_logsigmoid_xpu",
-    "test_python_ref__refs_div_trunc_rounding_xpu_bfloat16",
     "test_python_ref__refs_linspace_tensor_overload_xpu_int16",
     "test_python_ref__refs_linspace_tensor_overload_xpu_int32",
     "test_python_ref__refs_linspace_tensor_overload_xpu_int64",
@@ -108,7 +108,6 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_python_ref__refs_nn_functional_triplet_margin_loss_xpu_uint8",
     "test_python_ref__refs_square_xpu_bool",
     "test_python_ref__refs_trunc_xpu_float64",
-    "test_python_ref_executor__refs_div_trunc_rounding_executor_aten_xpu_bfloat16",
     "test_python_ref_executor__refs_geometric_executor_aten_xpu_bfloat16",
     "test_python_ref_executor__refs_geometric_executor_aten_xpu_float16",
     "test_python_ref_executor__refs_geometric_executor_aten_xpu_float32",
@@ -149,7 +148,6 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_python_ref_executor__refs_square_executor_aten_xpu_bool",
     "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex128",
     "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex64",
-    "test_python_ref_torch_fallback__refs_div_trunc_rounding_xpu_bfloat16",
     "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int16",
     "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int32",
     "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int64",
@@ -209,9 +207,6 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_python_ref_torch_fallback__refs_square_xpu_complex64",
     # Skip list of new added when porting XPU operators.
    # See: https://github.com/intel/torch-xpu-ops/issues/128
-    "test_dtypes_scatter_reduce_amax_xpu", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'
-    "test_dtypes_scatter_reduce_amin_xpu", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'
-    "test_dtypes_scatter_reduce_prod_xpu", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'
     "test_dtypes_view_as_complex_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16}
     "test_dtypes_view_as_real_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16}
     "test_noncontiguous_samples_native_dropout_backward_xpu_int64", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Long'.
@@ -223,15 +218,8 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'.
     "test_non_standard_bool_values_sort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "sort" not implemented for 'Bool'.
     "test_python_ref_executor__refs_pow_executor_aten_xpu_complex32", # Didn't align with CUDA, Unexpected success
-    "test_compare_cpu_nn_functional_grid_sample_xpu_float32", # AssertionError: Tensor-likes are not close!
-    "test_dtypes_nn_functional_batch_norm_without_cudnn_xpu", # AssertionError: The supported dtypes for nn.functional.batch_norm on device type xpu are incorrect!
     # Unexpected success
     "test_errors_histogramdd_xpu",
-    "test_noncontiguous_samples__batch_norm_with_update_xpu_float32",
-    "test_out_histc_xpu_float32",
-    "test_out_warning_logcumsumexp_xpu",
-    "test_python_ref__refs_mul_xpu_complex32",
-    "test_python_ref_torch_fallback__refs_mul_xpu_complex32",
     # Jiterator is only supported on CUDA and ROCm GPUs, none are available.
     "_jiterator_",
     # https://github.com/intel/torch-xpu-ops/issues/157
@@ -755,35 +743,23 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     # The following dtypes did not work in backward but are listed by the OpInfo: {torch.float16}.
     "test_dtypes_nn_functional_pad_replicate_negative_xpu",
     "test_dtypes_nn_functional_pad_replicate_xpu",
-
     # Op impl aligns with CUDA on the supported dtypes.
     # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'.
     # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed
     # in XPU supported operators. Then the case will work.
     "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64",
     "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64",
-
-    # Numeric difference
-    # https://github.com/intel/torch-xpu-ops/issues/544
-    # Mismatched elements: 7 / 1048576 (0.0%)
-    # Greatest absolute difference: 0.4922053598013041 at index (765, 860) (up to 1e-07 allowed)
-    # Greatest relative difference: 0.15330001655652495 at index (765, 860) (up to 1e-07 allowed)
-    "test_python_ref__refs_log2_xpu_complex128",
-
-    #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect!
-    #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}.
-    #XPU supports bfloat16, CUDA doesn't support it.
+    # AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect!
+    # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}.
+    # XPU supports bfloat16, CUDA doesn't support it.
     "test_dtypes_unique_consecutive_xpu",
     "test_dtypes_unique_xpu",
-
     # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
     # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
     "test_dtypes_polar_xpu",
-
     # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support
     # but test_dtypes infrastructure leverage CUDA supported datatypes
     "test_dtypes_histogram_xpu",
-
     # The following dtypes worked in forward but are not listed by the OpInfo: {torch.float16}.
     # Align with CPU implementation since,
     # 1. most cases of nextafter require Half dtype.
@@ -795,6 +771,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
 
 
 # test_binary_ufuncs
+
 skip_list = (
     "test_fmod_remainder_by_zero_integral_xpu_int64", # zero division is an undefined behavior: different handles on different backends
     "test_div_rounding_numpy_xpu_float16", # Calculation error. XPU implementation uses opmath type.
@@ -807,9 +784,6 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_pow_xpu_int64",
     # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available.
     "_jiterator_",
-    # Unexpected success
-    "test_type_promotion_logaddexp_xpu",
-
     # nextafter: Numeric error due to `std::nextafter` difference between CPU (GCC) and XPU (SYCL)
     # https://github.com/intel/torch-xpu-ops/issues/623
     # AssertionError: Scalars are not equal!
@@ -822,17 +796,10 @@ def launch_test(test_case, skip_list=None, exe_list=None):
 
 
 # test_scatter_gather_ops
+
 skip_list = (
     "test_gather_backward_with_empty_index_tensor_sparse_grad_True_xpu_float32", # Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend.
     "test_gather_backward_with_empty_index_tensor_sparse_grad_True_xpu_float64", # Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend.
-    "test_scatter__reductions_xpu_complex64", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'ComplexFloat'
-    "test_scatter_reduce_amax_xpu_bool", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'Bool'
-    "test_scatter_reduce_amin_xpu_bool", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'Bool'
-    "test_scatter_reduce_mean_xpu_complex128", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'ComplexDouble'
-    "test_scatter_reduce_mean_xpu_complex64", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'ComplexFloat'
-    "test_scatter_reduce_prod_xpu_bool", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'Bool'
-    "test_scatter_reduce_prod_xpu_complex128", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'ComplexDouble'
-    "test_scatter_reduce_prod_xpu_complex64", # align CUDA dtype - RuntimeError: "scatter_gather_base_kernel_func" not implemented for 'ComplexFloat'
 )
 res += launch_test("test_scatter_gather_ops_xpu.py", skip_list)
 
@@ -841,9 +808,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
 
 
 # test_sort_and_select
-skip_list = (
-    "test_sort_large_slice_xpu", # Hard code CUDA
-)
+skip_list = ("test_sort_large_slice_xpu",) # Hard code CUDA
 res += launch_test("test_sort_and_select_xpu.py", skip_list)
 
 nn_test_embedding_skip_list = (
@@ -1264,15 +1229,7 @@ def launch_test(test_case, skip_list=None, exe_list=None):
     "test_rnn_retain_variables_xpu_float64",
     "test_transformerencoderlayer_xpu_float64",
     "test_variable_sequence_xpu_float64",
-    # AssertionError: Scalars are not close!
- "test_InstanceNorm1d_general_xpu", - "test_InstanceNorm2d_general_xpu", - "test_InstanceNorm3d_general_xpu", # AssertionError: RuntimeError not raised - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_3_mode_bilinear_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_5_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_False_num_channels_5_mode_bilinear_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bicubic_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", @@ -1315,7 +1272,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", "test_index_put_src_datatype_xpu_float8_e4m3fn", - # Regression after PyTorch update # http://github.com/intel/torch-xpu-ops/issues/549 # IndexError: tensors used as indices must be long, byte or bool tensors. @@ -1485,29 +1441,24 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_reference_numerics_large_asinh_xpu_complex128", "test_reference_numerics_large_asinh_xpu_complex64", "test_reference_numerics_large_asinh_xpu_complex32", - # Mismatched elements: 1 / 943593 (0.0%) # Greatest absolute difference: 1.3363442121772096e-05 at index (742, 249) (up to 1e-05 allowed) # Greatest relative difference: 8.852276550896931e-06 at index (742, 249) (up to 1.3e-06 allowed) "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", - # AssertionError: Tensor-likes are not close! # exceeded maximum allowed difference # Greatest absolute difference: 6.266784475883469e-05 at index (463, 204) (up to 1e-05 allowed) # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) "test_reference_numerics_normal__refs_asinh_xpu_complex64", "test_reference_numerics_normal_asinh_xpu_complex64", - # Failed: Unexpected success "test_reference_numerics_large_rsqrt_xpu_complex32", - # Numeric difference # https://github.com/intel/torch-xpu-ops/issues/544 # Expected 0.00497517 but got 0.00497520063072443. # Absolute difference: 3.063072442997111e-08 (up to 0.0 allowed) # Relative difference: 6.156719153309558e-06 (up to 1e-06 allowed) "test_log1p_complex_xpu_complex64", - # Issue: https://github.com/intel/torch-xpu-ops/issues/622 # Mismatched elements: 8 / 943593 (0.0%) # Greatest absolute difference: inf at index (9, 860) (up to 0.001 allowed) @@ -1863,8 +1814,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_compile_int4_mm_m_64_k_32_n_64_xpu", "test_compile_int4_mm_m_64_k_64_n_48_xpu", "test_compile_int4_mm_m_64_k_64_n_64_xpu", - # Short is not supported in oneDNN! - "test_mm_empty_inputs_mixed_dtype_errors_xpu", # XPU does not support tunable. 
"test_bmm_tunableop_rocm_xpu_float32", "test_numeric_check_leak_tunableop_rocm_xpu_float32", @@ -2943,8 +2892,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float64", "test_big_num_tensors__foreach_norm_use_cuda_graph_True_xpu_float32", "test_big_num_tensors__foreach_norm_use_cuda_graph_True_xpu_float64", - # AssertionError: Tensor-likes are not close! - "test_pointwise_op_with_tensor_of_scalarlist_overload__foreach_addcdiv_is_fastpath_True_xpu_float16", ) res += launch_test("test_foreach_xpu.py", skip_list) @@ -2964,10 +2911,13 @@ def launch_test(test_case, skip_list=None, exe_list=None): res += launch_test("nn/test_convolution_xpu.py", skip_list) # test_dynamic_shapes + + res += launch_test("test_dynamic_shapes_xpu.py") # test_load_state_dict + res += launch_test("nn/test_load_state_dict_xpu.py") # test_module_hooks @@ -2980,6 +2930,7 @@ def launch_test(test_case, skip_list=None, exe_list=None): # test_parametrization + res += launch_test("nn/test_parametrization_xpu.py") exit_code = os.WEXITSTATUS(res) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 483af50f7..c3a268024 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -4,6 +4,7 @@ import copy import os import sys +import unittest import torch from torch import bfloat16, cuda @@ -230,9 +231,25 @@ "copysign", "count_nonzero", "nan_to_num", + "scatter_reduce", "nanmean", ] +# some case fail in cuda becasue of cuda's bug, so cuda set xfail in opdb +# but xpu can pass these case, and assert 'unexpected success' +# the list will pass these case. + + +_cuda_xfail_xpu_pass = [ + ("rsqrt", "test_reference_numerics_large"), + ("_batch_norm_with_update", "test_noncontiguous_samples"), + ("_batch_norm_with_update", "test_dispatch_symbolic_meta_outplace_all_strides"), + ("histc", "test_out"), + ("logcumsumexp", "test_out_warning"), + ("_refs.mul", "test_python_ref"), + ("_refs.mul", "test_python_ref_torch_fallback"), +] + def get_wrapped_fn(fn): if hasattr(fn, "__wrapped__"): @@ -512,38 +529,35 @@ def __init__(self, patch_test_case=True) -> None: self.cuda_is_bf16_supported = cuda.is_bf16_supported def align_db_decorators(self, db): - for info in db: - decorator_xpu = [] + def gen_xpu_wrappers(name, wrappers): + wrapper_xpu = [] replaced = False - for decorator in info.decorators: - if type(decorator) == DecorateInfo: - if decorator.device_type == "cuda": - decorator_xpu.append(decorator) - decorator.device_type = "xpu" - replaced = True - else: - decorator_xpu.append(decorator) - elif self.only_cuda_fn == decorator: - decorator_xpu.append(common_device_type.onlyCUDA) + for wrapper in wrappers: + if type(wrapper) == DecorateInfo: + if wrapper.device_type == "cuda": + if ( + unittest.expectedFailure in wrapper.decorators + and (name, wrapper.test_name) in _cuda_xfail_xpu_pass + ): + pass + else: + wrapper.device_type = "xpu" + replaced = True + wrapper_xpu.append(wrapper) + elif self.only_cuda_fn == wrapper: + wrapper_xpu.append(common_device_type.onlyCUDA) replaced = True - if replaced: - info.decorators = tuple(decorator_xpu) - skip_xpu = [] - replaced = False + return replaced, wrapper_xpu + + for info in db: + if hasattr(info, "decorators"): + replaced, decorator_xpu = gen_xpu_wrappers(info.name, info.decorators) + if replaced: + info.decorators = tuple(decorator_xpu) if hasattr(info, "skips"): - for skip in info.skips: - if type(skip) == DecorateInfo: - if skip.device_type == "cuda": - skip_xpu.append(decorator) - 
skip.device_type = "xpu" - replaced = True - else: - skip_xpu.append(skip) - elif self.only_cuda_fn == skip: - skip_xpu.append(common_device_type.onlyCUDA) - replaced = True - if replaced: - info.skips = tuple(skip_xpu) + replaced, skip_xpu = gen_xpu_wrappers(info.name, info.skips) + if replaced: + info.skips = tuple(skip_xpu) def align_supported_dtypes(self, db): for opinfo in db: