diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index c67be7dc5..54c2ce75f 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -50,16 +50,7 @@
     "test_compare_cpu_exp2_xpu_complex128",
     "test_compare_cpu_exp2_xpu_complex64",
     "test_compare_cpu_nextafter_xpu_bfloat16",
-    # skip random failure due to accuracy
-    # AssertionError: Tensor-likes are not close!
-    "test_compare_cpu_atan2_xpu_bfloat16", # CUDA does not support the data type either
-    "test_compare_cpu_native_dropout_backward_xpu_bool",
-    "test_compare_cpu_native_dropout_backward_xpu_int16",
-    "test_compare_cpu_native_dropout_backward_xpu_int32",
-    "test_compare_cpu_native_dropout_backward_xpu_int64",
-    "test_compare_cpu_native_dropout_backward_xpu_int8",
-    "test_compare_cpu_native_dropout_backward_xpu_uint8",
     "test_non_standard_bool_values_native_dropout_backward_xpu_bool",
     # Need FP64 golden ref for more accurate comparison
     "test_compare_cpu_log_softmax_xpu_bfloat16",
@@ -79,8 +70,6 @@
     # https://en.cppreference.com/w/cpp/numeric/complex/exp
     "test_compare_cpu_sigmoid_xpu_complex64",
     "test_compare_cpu_sigmoid_xpu_complex128",
-    # Align with CUDA dtypes - RuntimeError: "avg_pool2d_out_xpu" not implemented for 'Long'
-    "test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",
     # Special handle (different calculation order) in CPU reference impl.
     # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300
     "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16",
@@ -93,11 +82,6 @@
     "test_view_replay_nn_functional_embedding_bag_xpu_float32",
     # Double and complex datatype matmul is not supported in oneDNN
     "test_compare_cpu_cdist_xpu_float64",
-    # CPU reference fail. `abs_cpu` does not support bool.
-    # The case should be skipped by PyTorch test infrastructure, but not be
-    # skipped correctly after https://github.com/pytorch/pytorch/pull/124147
-    # https://github.com/intel/torch-xpu-ops/issues/412
-    "test_compare_cpu_abs_xpu_bool",
     # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision
     # Not in CUDA test scope too
     "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16",
@@ -146,28 +130,11 @@
     # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
     # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
     "test_compare_cpu_polar_xpu_bfloat16",
-    # Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
-    # https://github.com/intel/torch-xpu-ops/issues/549
-    # Example fail log
-    # FAILED test_ops_xpu.py::TestCommonXPU::test_compare_cpu_nn_functional_batch_norm_xpu_float16 - AssertionError: Tensor-likes are not close!
-    # Mismatched elements: 3 / 72 (4.2%)
-    # Greatest absolute difference: 0.0029296875 at index (0, 1, 1, 0) (up to 0.001 allowed)
-    # Greatest relative difference: 0.0032501220703125 at index (2, 1, 2, 1) (up to 0.001 allowed)
-    "test_compare_cpu_nn_functional_batch_norm_xpu_float16",
-    "test_compare_cpu_std_mean_xpu_bfloat16",
-    "test_compare_cpu_sub_xpu_float16",
-    "test_compare_cpu_var_mean_xpu_bfloat16",
-    # Precision error.
-    # Mismatched elements: 2 / 125 (1.6%)
-    # Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed)
-    # Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed)
-    "test_compare_cpu_cumprod_xpu_bfloat16",
     # Precision error.
-    # Mismatched elements: 1 / 9 (11.1%)
-    # Greatest absolute difference: 0.001953125 at index (2, 2) (up to 0.001 allowed)
-    # Greatest relative difference: 0.004669189453125 at index (2, 2) (up to 0.001 allowed)
-    # Not in CUDA test scope too
-    "test_compare_cpu_prod_xpu_bfloat16 ",
+    # Mismatched elements: 1 / 25 (4.0%)
+    # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed)
+    # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed)
+    "test_compare_cpu_sub_xpu_float16",
     # different results for value index due to unstable sort.
     # XPU and CUDA have the same result.
     "test_compare_cpu_median_xpu_int16",
diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py
index c32508d2c..e6d46fccb 100644
--- a/test/xpu/extended/test_ops_xpu.py
+++ b/test/xpu/extended/test_ops_xpu.py
@@ -63,10 +63,20 @@ class Namespace:
     # Therefore, we build TestCommonProxy by inheriting the TestCommon and TestCase to ensure
     # the same feature set as the TestCommon.
     class TestCommonProxy(TestCase, TestCommonBase):
-        pass
+        def __init__(self, test_case=None):
+            if test_case:
+                # copy custom accuracy settings from the owning test case
+                self.maxDiff = test_case.maxDiff
+                self.precision = test_case.precision
+                self.rel_tol = test_case.rel_tol
 
     class TestCompositeComplianceProxy(TestCase, TestCompositeComplianceBase):
-        pass
+        def __init__(self, test_case=None):
+            if test_case:
+                # copy custom accuracy settings from the owning test case
+                self.maxDiff = test_case.maxDiff
+                self.precision = test_case.precision
+                self.rel_tol = test_case.rel_tol
 
 
 class TestCommon(TestCase):
@@ -78,13 +88,13 @@ class TestCommon(TestCase):
     def test_compare_cpu(self, device, dtype, op):
         # check if supported both by CPU and XPU
         if dtype in op.dtypes and dtype in op.supported_dtypes(device):
-            self.proxy = Namespace.TestCommonProxy()
+            self.proxy = Namespace.TestCommonProxy(self)
             test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
             test_common_test_fn(self.proxy, device, dtype, op)
         # for CUDA doesn't support operators
         elif (op.name in _ops_without_cuda_support):
             if dtype in op.dtypes:
-                self.proxy = Namespace.TestCommonProxy()
+                self.proxy = Namespace.TestCommonProxy(self)
                 test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
                 test_common_test_fn(self.proxy, device, dtype, op)
         else:
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index cf9bd687d..781b6693b 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -697,11 +697,6 @@
         "test_conj_view_to_sparse_xpu_complex64",
         "test_neg_conj_view_to_sparse_xpu_complex128",
         "test_neg_view_to_sparse_xpu_float64",
-        # # CPU fallback error:AssertionError: The supported dtypes for nn.functional.pad on device type xpu are incorrect!
-        # The following dtypes did not work in forward but are listed by the OpInfo: {torch.float16}.
-        # The following dtypes did not work in backward but are listed by the OpInfo: {torch.float16}.
-        "test_dtypes_nn_functional_pad_replicate_negative_xpu",
-        "test_dtypes_nn_functional_pad_replicate_xpu",
         # Op impl aligns with CUDA on the supported dtypes.
         # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'.
         # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed
@@ -711,7 +706,6 @@
         #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect!
         #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}.
         #XPU supports bfloat16, CUDA doesn't support it.
-        "test_dtypes_unique_consecutive_xpu",
         "test_dtypes_unique_xpu",
         # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
         # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
@@ -1032,10 +1026,8 @@
         "test_save_load_nn_TransformerEncoder_train_mode_xpu_float64",
         "test_save_load_nn_Transformer_xpu_float64",
         # Unexpected success:
-        "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
         "test_cpu_gpu_parity_nn_ConvTranspose1d_xpu_complex32",
-        "test_memory_format_nn_AvgPool2d_xpu_float32",
-        "test_memory_format_nn_AvgPool2d_xpu_float64",
+        "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
         # CPU fallback could not cover these
         # CUDA xfails
         # Failed: Unexpected success
@@ -1104,31 +1096,12 @@
         "test_to_nn_TransformerEncoder_eval_mode_swap_True_set_grad_True_xpu_float32",
         "test_to_nn_TransformerEncoder_train_mode_swap_True_set_grad_True_xpu_float32",
         "test_to_nn_Transformer_swap_True_set_grad_True_xpu_float32",
-        #issue 746, adjust tolerence
-        "test_non_contiguous_tensors_nn_Conv3d_xpu_float32",
     ),
 
     "test_nn_xpu.py": (
         # AttributeError: module 'torch.xpu' has no attribute 'FloatTensor'
         "test_type",
         # AssertionError: Tensor-likes are not close!
-        "test_Conv2d_dilated_with_long_tensor_cuda",
-        "test_Conv2d_groups_thnn_with_long_tensor_cuda",
-        "test_Conv2d_groups_with_long_tensor_cuda",
-        "test_Conv2d_no_bias_with_long_tensor_cuda",
-        "test_Conv2d_padding_with_long_tensor_cuda",
-        "test_Conv2d_strided_with_long_tensor_cuda",
-        "test_Conv2d_with_long_tensor_cuda",
-        "test_Conv3d_1x1x1_no_bias_with_long_tensor_cuda",
-        "test_Conv3d_groups_with_long_tensor_cuda",
-        "test_Conv3d_no_bias_with_long_tensor_cuda",
-        "test_Conv3d_stride_padding_with_long_tensor_cuda",
-        "test_Conv3d_stride_with_long_tensor_cuda",
-        "test_Conv3d_with_long_tensor_cuda",
-        "test_ConvTranspose2d_dilated_with_long_tensor_cuda",
-        "test_ConvTranspose2d_groups_with_long_tensor_cuda",
-        "test_ConvTranspose2d_no_bias_with_long_tensor_cuda",
-        "test_ConvTranspose2d_with_long_tensor_cuda",
         "test_RReLU_cuda",
         "test_RReLU_no_batch_dim_cuda",
         "test_RReLU_with_up_down_cuda",
@@ -1176,10 +1149,6 @@
         # AssertionError: False is not true
         "test_ctc_loss_cudnn_xpu", # want "xpu" in function name
         "test_ctc_loss_cudnn_tensor", # want "xpu" in function name
-        # RuntimeError: "smooth_l1_backward_cpu_out" not implemented for 'Half'
-        "test_SmoothL1Loss_no_batch_dim_mean_cuda_half",
-        "test_SmoothL1Loss_no_batch_dim_none_cuda_half",
-        "test_SmoothL1Loss_no_batch_dim_sum_cuda_half",
         # RuntimeError: "multilabel_margin_loss_forward_out_frame" not implemented for 'Half'
         "test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half",
         "test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half",
@@ -1230,8 +1199,6 @@
         # CPU only (vs Numpy). CUDA skips these cases since non-deterministic results are outputed for inf and nan.
         "test_float_to_int_conversion_finite_xpu_int8",
         "test_float_to_int_conversion_finite_xpu_int16",
-        # sparse
-        "test_tensor_ctor_device_inference_xpu",
         # Dispatch issue. It is a composite operator. But it is implemented by
         # DispatchStub. XPU doesn't support DispatchStub.
"test_kaiser_window_xpu", @@ -1316,11 +1283,6 @@ "test_reference_numerics_large_asinh_xpu_complex128", "test_reference_numerics_large_asinh_xpu_complex64", "test_reference_numerics_large_asinh_xpu_complex32", - # Mismatched elements: 1 / 943593 (0.0%) - # Greatest absolute difference: 1.3363442121772096e-05 at index (742, 249) (up to 1e-05 allowed) - # Greatest relative difference: 8.852276550896931e-06 at index (742, 249) (up to 1.3e-06 allowed) - "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", - # AssertionError: Tensor-likes are not close! # exceeded maximum allowed difference # Greatest absolute difference: 6.266784475883469e-05 at index (463, 204) (up to 1e-05 allowed) @@ -1468,8 +1430,6 @@ # https://github.com/intel/torch-xpu-ops/issues/275 # NotImplementedError: Could not run 'aten::empty_quantized' with arguments from the 'QuantizedXPU' backend. "test_flip_xpu_float32", - # RuntimeError: "trace" not implemented for 'Half' - "test_trace_xpu_float16", ), "test_content_store_xpu.py": None, @@ -1729,7 +1689,6 @@ "test_fn_fwgrad_bwgrad_linalg_cholesky_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_cond_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_cond_xpu_float64", - "test_fn_fwgrad_bwgrad_linalg_det_singular_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_det_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_det_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_eig_xpu_complex128", @@ -1765,7 +1724,6 @@ "test_fn_fwgrad_bwgrad_linalg_norm_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_float64", - "test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_pinv_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_xpu_float64", @@ -1957,16 +1915,7 @@ # torch.autograd.gradcheck.GradcheckError: While considering the real part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_linalg_norm_xpu_complex128", # torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0, - "test_fn_fwgrad_bwgrad_linalg_vector_norm_xpu_complex128", - "test_fn_fwgrad_bwgrad_masked_normalize_xpu_complex128", - "test_fn_fwgrad_bwgrad_norm_inf_xpu_complex128", - "test_fn_fwgrad_bwgrad_renorm_xpu_complex128", "test_forward_mode_AD_linalg_norm_xpu_complex128", - "test_forward_mode_AD_linalg_vector_norm_xpu_complex128", - "test_forward_mode_AD_masked_normalize_xpu_complex128", - "test_forward_mode_AD_norm_inf_xpu_complex128", - "test_forward_mode_AD_renorm_xpu_complex128", - "test_inplace_forward_mode_AD_renorm_xpu_complex128", # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_float64", @@ -1976,9 +1925,6 @@ "test_forward_mode_AD_nn_functional_conv_transpose2d_xpu_float64", "test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_complex128", "test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_float64", - # RuntimeError: input tensor must have at least one element, but got input_sizes = [1, 0, 1] - "test_fn_fwgrad_bwgrad_nn_functional_group_norm_xpu_float64", - "test_forward_mode_AD_nn_functional_group_norm_xpu_float64", # 
torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64", "test_forward_mode_AD_nn_functional_rrelu_xpu_float64", @@ -2010,11 +1956,6 @@ "test_scaled_mm_vs_emulated_float16_xpu", "test_scaled_mm_vs_emulated_float32_xpu", "test_scaled_mm_vs_emulated_row_wise_bfloat16_xpu", - # https://github.com/intel/torch-xpu-ops/issues/676 - # Mismatched elements: 9 / 1003002 (0.0%) - # Greatest absolute difference: 711.126220703125 at index (472, 999) (up to 0.1 allowed) - # Greatest relative difference: 2.7107455730438232 at index (472, 997) (up to 0.1 allowed) - "test_cublas_addmm_size_1000_xpu_float32", ), "test_maskedtensor_xpu.py": ( @@ -2110,6 +2051,7 @@ "test_reduction_all_sum_layout1_xpu_float16", "test_reduction_all_sum_layout1_xpu_float64", # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta + "test_like_", "test_invalid_sparse_layout_xpu", "test_to_dense_and_sparse_csr_xpu", "test_binary_core_add_layout2_xpu_float16", @@ -2359,7 +2301,6 @@ "test_fn_gradgrad_linalg_cholesky_xpu_float64", "test_fn_gradgrad_linalg_cond_xpu_complex128", "test_fn_gradgrad_linalg_cond_xpu_float64", - "test_fn_gradgrad_linalg_det_singular_xpu_float64", "test_fn_gradgrad_linalg_det_xpu_complex128", "test_fn_gradgrad_linalg_det_xpu_float64", "test_fn_gradgrad_linalg_eig_xpu_complex128", @@ -2394,7 +2335,6 @@ "test_fn_gradgrad_linalg_multi_dot_xpu_float64", "test_fn_gradgrad_linalg_pinv_hermitian_xpu_complex128", "test_fn_gradgrad_linalg_pinv_hermitian_xpu_float64", - "test_fn_gradgrad_linalg_pinv_singular_xpu_complex128", "test_fn_gradgrad_linalg_pinv_singular_xpu_float64", "test_fn_gradgrad_linalg_pinv_xpu_complex128", "test_fn_gradgrad_linalg_pinv_xpu_float64", @@ -2486,14 +2426,6 @@ "test_fn_gradgrad_nn_functional_rrelu_xpu_float64", "test_inplace_grad_nn_functional_rrelu_xpu_float64", "test_inplace_gradgrad_nn_functional_rrelu_xpu_float64", - ### Error #3 in TestBwdGradientsXPU , totally 8 , torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex outputs only, Jacobian mismatch for output 0 with respect to input 0, - "test_fn_grad_masked_normalize_xpu_complex128", - "test_fn_grad_renorm_xpu_complex128", - "test_fn_gradgrad_linalg_vector_norm_xpu_complex128", - "test_fn_gradgrad_masked_normalize_xpu_complex128", - "test_fn_gradgrad_renorm_xpu_complex128", - "test_inplace_grad_renorm_xpu_complex128", - "test_inplace_gradgrad_renorm_xpu_complex128", ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", @@ -2503,8 +2435,6 @@ "test_fn_gradgrad_nn_functional_conv_transpose2d_xpu_float64", "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_complex128", "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_float64", - ### Error #6 in TestBwdGradientsXPU , totally 5 , torch.autograd.gradcheck.GradcheckError: Backward is not reentrant, i.e., running backward with same input and grad_output multiple times gives different values, although analytical gradient matches numerical gradient.The tolerance for nondeterminism was 0.0. 
- "test_fn_grad_nn_functional_max_pool2d_xpu_float64", "test_fn_gradgrad_index_reduce_mean_xpu_float64", "test_fn_gradgrad_index_reduce_prod_xpu_float64", "test_inplace_gradgrad_index_reduce_mean_xpu_float64", @@ -2672,7 +2602,6 @@ "test_multihead_attention_dtype_batch_first_xpu_float64", "test_multihead_attention_dtype_xpu_float64", "test_multihead_attn_fast_path_query_and_bias_have_different_dtypes_xpu_float64", - "test_multihead_attn_fast_path_small_test_xpu_float64", "test_multihead_attn_in_proj_bias_none_xpu_float64", "test_multihead_attn_in_proj_weight_none_xpu_float64", ), diff --git a/test/xpu/test_decomp_xpu.py b/test/xpu/test_decomp_xpu.py index 33c997ec7..d659197d9 100644 --- a/test/xpu/test_decomp_xpu.py +++ b/test/xpu/test_decomp_xpu.py @@ -1,5 +1,6 @@ # Owner(s): ["module: intel"] +import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_utils import run_tests @@ -9,8 +10,72 @@ from .xpu_test_utils import XPUPatchForImport with XPUPatchForImport(False): + import test_decomp from test_decomp import TestDecomp,DecompOneOffTests +def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs): + assert orig.dtype == decomp.dtype, f"{i} Operation: {op}" + if orig.numel() == 0 or decomp.numel() == 0: + assert orig.numel() == decomp.numel() + return + assert orig.shape == decomp.shape, f"{i} Operation: {op}" + tol_table = { + (torch.bfloat16, torch.ops.aten.native_layer_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_layer_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_layer_norm_backward.default): 1e-3, + (torch.bfloat16, torch.ops.aten.native_layer_norm_backward.default): 2e-2, + (torch.bfloat16, torch.ops.aten.native_batch_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_batch_norm.default): 1e-5, + (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.default): 1e-5, + (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5, + (torch.float16, torch.ops.aten._native_batch_norm_legit.default): 1e-5, + (torch.float16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5, + (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-4, + (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-4, + (torch.bfloat16, torch.ops.aten.var_mean.correction): 5e-7, + (torch.float16, torch.ops.aten.var_mean.correction): 5e-7, + (torch.bfloat16, torch.ops.aten.var_mean.dim): 5e-7, + (torch.float16, torch.ops.aten.var_mean.dim): 5e-7, + (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2, + (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1, + (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2, + (torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1, + (torch.float16, torch.ops.aten.hardswish.default): 2e-7, + (torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7, + (torch.float16, torch.ops.aten.multi_margin_loss.default): 3e-2, + (torch.bfloat16, torch.ops.aten.multi_margin_loss.default): 5e-2, + (torch.float16, torch.ops.aten.multilabel_margin_loss_forward.default): 3e-2, + (torch.bfloat16, torch.ops.aten.multilabel_margin_loss_forward.default): 3e-2, + (torch.float16, torch.ops.aten.reflection_pad1d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad1d_backward.default): 5e-3, + (torch.float16, torch.ops.aten.reflection_pad2d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad2d_backward.default): 5e-3, + (torch.float16, 
torch.ops.aten.reflection_pad3d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad3d_backward.default): 5e-2, + # see https://github.com/pytorch/pytorch/pull/96264 + (torch.float16, torch.ops.aten.mv.default): 1e-5, + (torch.bfloat16, torch.ops.aten.mv.default): 1e-5, + (torch.float16, torch.ops.aten.log_sigmoid_backward.default): 2e-5, + (torch.float16, torch.ops.aten._batch_norm_with_update.default): 2e-7, # adjust tolerance for xpu, so hook this func + (torch.bfloat16, torch.ops.aten._batch_norm_with_update.default): 2e-7, # adjust tolerance for xpu, so hook this func + } + if ref.is_floating_point(): + orig_diff = (orig - ref).abs().max() + decomp_diff = (decomp - ref).abs().max() + atol = tol_table.get((test_dtype, op), 1e-7) + if decomp_diff > orig_diff + atol: + raise RuntimeError( + f"Difference from float64 is larger with decomposition {op.__name__}" + f" than original on output {i}. Original max diff: {orig_diff}, Decomp max diff: {decomp_diff}\n" + f"atol = {atol}\n" + f"args = {args}\n" + f"kwargs = {kwargs}" + ) + else: + test_case.assertEqual( + orig, decomp, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}" + ) +test_decomp.op_assert_ref=_op_assert_ref instantiate_device_type_tests(TestDecomp, globals(), only_for="xpu", allow_xpu=True) instantiate_device_type_tests(DecompOneOffTests, globals(), only_for="xpu", allow_xpu=True) diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index 80fb3c8b0..1a4a57a41 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -6974,11 +6974,7 @@ def helper(dim, dtype, device, size_result, size_source): ref_out = tensor.index_add(dim, index, source, alpha=2.) / 2. ref_out = ref_out.to(dtype=dtype) out = tensor.index_add(dim, index, source) - if device == 'cuda' or device == 'xpu': - self.assertEqual(out, ref_out, atol=1e-2, rtol=1e-2) - else: - # scatter_add uses fp32 as accumulate type, while index_add doesn't. - self.assertEqual(out, ref_out.to(dtype=dtype), atol=1e-2, rtol=1e-2) + self.assertEqual(out, ref_out, atol=7e-2, rtol=1.2e-2) for dim in [-1, -2, -3]: for dtype in all_types_and_complex_and(torch.half, torch.bfloat16): diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 1c7dbcdff..c6c239cca 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -14,6 +14,7 @@ common_methods_invocations, common_utils, ) +from torch.testing._internal.common_device_type import tol, toleranceOverride from torch.testing._internal.common_modules import module_db from torch.testing._internal.common_nn import CriterionTest, ModuleTest from torch.testing._internal.common_utils import set_default_dtype @@ -255,11 +256,85 @@ ("logcumsumexp", "test_out_warning"), ("_refs.mul", "test_python_ref"), ("_refs.mul", "test_python_ref_torch_fallback"), + ("nn.AvgPool2d", "test_memory_format"), ("narrow_copy","test_meta_outplace"), ("narrow_copy","test_dispatch_meta_outplace"), ("narrow_copy","test_dispatch_symbolic_meta_outplace"), ] +# some case should adjust tolerance to pass. +# The new threshold is at the same order of magnitude as cuda's or cpu's. 
+# Format: {op_name: {(cls_name, test_name): {dtype: tol(atol, rtol)}}}
+
+_xpu_tolerance_override = {
+    "nn.functional.tanhshrink": {
+        ("TestUnaryUfuncs", "test_reference_numerics_normal"): {
+            torch.complex64: tol(atol=2e-05, rtol=9e-06),
+            torch.bfloat16: tol(atol=1e-02, rtol=1.6e-02),
+        }
+    },
+    "atan2": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "cumprod": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.008),
+        }
+    },
+    "nanmean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.008),
+        }
+    },
+    "nansum": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.006),
+        }
+    },
+    "nn.functional.batch_norm": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.003, rtol=0.004),
+        }
+    },
+    "nn.functional.embedding_bag": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.005, rtol=0.007),
+        }
+    },
+    "nn.functional.group_norm": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.002, rtol=0.006),
+        }
+    },
+    "prod": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.005),
+        }
+    },
+    "rsqrt": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.004, rtol=0.007),
+        }
+    },
+    "std_mean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "var_mean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "nn.LazyConvTranspose3d": {
+        ("TestModule", "test_non_contiguous_tensors"): {
+            torch.float32: tol(atol=2e-5, rtol=5e-5),
+        }
+    },
+}
+
 
 def get_wrapped_fn(fn):
     if hasattr(fn, "__wrapped__"):
@@ -296,6 +371,8 @@ def to_xpu(obj, type_map=None):
 
 
 def ModuleTest_test_xpu(self, test_case):
+    if not self.should_test_cuda:
+        raise unittest.SkipTest("Excluded from XPU tests")
     with set_default_dtype(self.default_dtype):
         cpu_input = self._get_input()
 
@@ -320,13 +397,6 @@
         test_case._zero_grad_parameters(xpu_module)
         cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
         xpu_output = test_case._forward(xpu_module, xpu_input_tuple)
-        test_case.assertEqual(
-            cpu_input_tuple,
-            xpu_input_tuple,
-            atol=self.precision,
-            rtol=0,
-            exact_dtype=False,
-        )
         if getattr(cpu_module, "return_indices", False):
             cpu_output = cpu_output[0]
             xpu_output = xpu_output[0]
@@ -334,18 +404,11 @@
             cpu_output, xpu_output, atol=self.precision, rtol=0, exact_dtype=False
         )
 
-        # Run backwards on CPU and GPU and compare results
+        # Run backwards on CPU and XPU and compare results
         for _ in range(5):
            cpu_gradOutput = cpu_output.clone().normal_()
            xpu_gradOutput = cpu_gradOutput.type_as(xpu_output)
-            test_case.assertEqual(
-                cpu_input_tuple,
-                xpu_input_tuple,
-                atol=self.precision,
-                rtol=0,
-                exact_dtype=False,
-            )
             cpu_gradInput = test_case._backward(
                 cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput
             )
@@ -361,7 +424,7 @@
             )
             for cpu_d_p, xpu_d_p in zip(cpu_param[1], xpu_param[1]):
                 test_case.assertEqual(cpu_d_p, xpu_d_p, atol=self.precision, rtol=0)
 
-        # Run double-backwards on CPU and GPU and compare results
+        # Run double-backwards on CPU and XPU and compare results
         if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
             cpu_output = cpu_module(*cpu_input_tuple)
@@ -441,6 +504,8 @@ def convert_dtype(obj, dtype, requires_grad=False):
         else:
             return obj
 
+    if not self.should_test_cuda:
+        raise unittest.SkipTest("Excluded from XPU tests")
     with set_default_dtype(self.default_dtype):
         cpu_input = self._get_input()
         cpu_target = self._get_target()
@@ -539,7 +604,7 @@ def __init__(self, patch_test_case=True) -> None:
         self.cuda_is_bf16_supported = cuda.is_bf16_supported
 
     def align_db_decorators(self, db):
-        def gen_xpu_wrappers(name, wrappers):
+        def gen_xpu_wrappers(op_name, wrappers):
             wrapper_xpu = []
             replaced = False
             for wrapper in wrappers:
@@ -547,7 +612,7 @@
                 if wrapper.device_type == "cuda":
                     if (
                         unittest.expectedFailure in wrapper.decorators
-                        and (name, wrapper.test_name) in _cuda_xfail_xpu_pass
+                        and (op_name, wrapper.test_name) in _cuda_xfail_xpu_pass
                     ):
                         pass
                     else:
@@ -562,6 +627,19 @@
         for info in db:
             if hasattr(info, "decorators"):
                 replaced, decorator_xpu = gen_xpu_wrappers(info.name, info.decorators)
+
+                # The later decorator overrides the earlier one, so append the tolerance override last.
+                if info.name in _xpu_tolerance_override:
+                    replaced = True
+                    for case, tolerance in _xpu_tolerance_override[info.name].items():
+                        decorator_xpu.append(
+                            DecorateInfo(
+                                toleranceOverride(tolerance),
+                                case[0],  # cls_name
+                                case[1],  # test_name
+                                device_type="xpu",
+                            )
+                        )
                 if replaced:
                     info.decorators = tuple(decorator_xpu)
             if hasattr(info, "skips"):
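
Note on the tolerance-override mechanism this patch adds to test/xpu/xpu_test_utils.py: each `_xpu_tolerance_override` entry is expanded into a `DecorateInfo` wrapping `toleranceOverride`, scoped to the "xpu" device type, and appended to the matching OpInfo's decorators, replacing the per-test skip entries removed above. The following is a minimal, self-contained sketch of that expansion; the `override` table and the `decorators` list below are illustrative names, not part of the patch.

    import torch
    from torch.testing._internal.common_device_type import tol, toleranceOverride
    from torch.testing._internal.common_methods_invocations import DecorateInfo

    # Illustrative single-op table in the same shape as _xpu_tolerance_override:
    # {(cls_name, test_name): {dtype: tol(atol, rtol)}}
    override = {
        ("TestCommon", "test_compare_cpu"): {
            torch.bfloat16: tol(atol=2e-3, rtol=5e-3),
        },
    }

    decorators = []
    for (cls_name, test_name), dtype_tols in override.items():
        decorators.append(
            DecorateInfo(
                toleranceOverride(dtype_tols),  # relaxes atol/rtol for the listed dtypes
                cls_name,                       # e.g. "TestCommon"
                test_name,                      # e.g. "test_compare_cpu"
                device_type="xpu",              # applied only when running on XPU
            )
        )

Because decorators appended later take precedence, adding these overrides after the CUDA-derived wrappers lets the relaxed XPU tolerances take effect for the listed dtypes while leaving all other dtypes and devices untouched.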