adjust tolerance for xpu in utils (#749)
Fix some accuracy problems.

1. Add the `toleranceOverride` decorator to OpInfo entries when hooking opdb.
This handles cases with accuracy problems in tests wrapped by the `ops`
decorator (except when the difference is nan or inf); a sketch of the idea
appears after the lists below.

2. Fix test_decomp and test_torch accuracy problems.
3. Align `ModuleTest_test` with the latest PyTorch code:
        "test_Conv2d_dilated_with_long_tensor_cuda",
        "test_Conv2d_groups_thnn_with_long_tensor_cuda",
        "test_Conv2d_groups_with_long_tensor_cuda",
        "test_Conv2d_no_bias_with_long_tensor_cuda",
        "test_Conv2d_padding_with_long_tensor_cuda",
        "test_Conv2d_strided_with_long_tensor_cuda",
        "test_Conv2d_with_long_tensor_cuda",
        "test_Conv3d_1x1x1_no_bias_with_long_tensor_cuda",
        "test_Conv3d_groups_with_long_tensor_cuda",
        "test_Conv3d_no_bias_with_long_tensor_cuda",
        "test_Conv3d_stride_padding_with_long_tensor_cuda",
        "test_Conv3d_stride_with_long_tensor_cuda",
        "test_Conv3d_with_long_tensor_cuda",
        "test_ConvTranspose2d_dilated_with_long_tensor_cuda",
        "test_ConvTranspose2d_groups_with_long_tensor_cuda",
        "test_ConvTranspose2d_no_bias_with_long_tensor_cuda",
        "test_ConvTranspose2d_with_long_tensor_cuda",
4. Add 'nn_AvgPool2d' to the CUDA xfail list:
        "test_memory_format_nn_AvgPool2d_xpu_float32",
        "test_memory_format_nn_AvgPool2d_xpu_float64",
5. Clean up the skip list, removing cases that now pass with the latest code:
        "test_compare_cpu_native_dropout_backward_xpu_bool",
        "test_compare_cpu_native_dropout_backward_xpu_int16",
        "test_compare_cpu_native_dropout_backward_xpu_int32",
        "test_compare_cpu_native_dropout_backward_xpu_int64",
        "test_compare_cpu_native_dropout_backward_xpu_int8",
        "test_compare_cpu_native_dropout_backward_xpu_uint8",
        "test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",
        "test_compare_cpu_abs_xpu_bool",
        "test_dtypes_nn_functional_linear_xpu",
        "test_dtypes_nn_functional_pad_replicate_negative_xpu",
        "test_dtypes_nn_functional_pad_replicate_xpu",
        "test_dtypes_unique_consecutive_xpu",
        "test_SmoothL1Loss_no_batch_dim_mean_cuda_half",
        "test_SmoothL1Loss_no_batch_dim_none_cuda_half",
        "test_SmoothL1Loss_no_batch_dim_sum_cuda_half",
        "test_tensor_ctor_device_inference_xpu",
        "test_trace_xpu_float16",
        "test_fn_fwgrad_bwgrad_linalg_det_singular_xpu_float64",
        "test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_complex128",
        "test_fn_fwgrad_bwgrad_linalg_vector_norm_xpu_complex128",
        "test_fn_fwgrad_bwgrad_masked_normalize_xpu_complex128",
        "test_fn_fwgrad_bwgrad_norm_inf_xpu_complex128",
        "test_fn_fwgrad_bwgrad_renorm_xpu_complex128",
        "test_forward_mode_AD_linalg_vector_norm_xpu_complex128",
        "test_forward_mode_AD_masked_normalize_xpu_complex128",
        "test_forward_mode_AD_norm_inf_xpu_complex128",
        "test_forward_mode_AD_renorm_xpu_complex128",
        "test_inplace_forward_mode_AD_renorm_xpu_complex128",
        "test_fn_fwgrad_bwgrad_nn_functional_group_norm_xpu_float64",
        "test_forward_mode_AD_nn_functional_group_norm_xpu_float64",
        "test_fn_gradgrad_linalg_det_singular_xpu_float64",
        "test_fn_gradgrad_linalg_pinv_singular_xpu_complex128",
        "test_fn_grad_masked_normalize_xpu_complex128",
        "test_fn_grad_renorm_xpu_complex128",
        "test_fn_gradgrad_linalg_vector_norm_xpu_complex128",
        "test_fn_gradgrad_masked_normalize_xpu_complex128",
        "test_fn_gradgrad_renorm_xpu_complex128",
        "test_inplace_grad_renorm_xpu_complex128",
        "test_inplace_gradgrad_renorm_xpu_complex128",
        "test_fn_grad_nn_functional_max_pool2d_xpu_float64",
        "test_multihead_attn_fast_path_small_test_xpu_float64",

---------

Co-authored-by: Feng Yuan <[email protected]>
yuchengliu1 and fengyuan14 authored Aug 21, 2024
1 parent 97532c0 commit e040874
Showing 6 changed files with 182 additions and 137 deletions.
41 changes: 4 additions & 37 deletions test/xpu/extended/run_test_with_skip.py
@@ -50,16 +50,7 @@
"test_compare_cpu_exp2_xpu_complex128",
"test_compare_cpu_exp2_xpu_complex64",
"test_compare_cpu_nextafter_xpu_bfloat16",
# skip random failure due to accuracy
# AssertionError: Tensor-likes are not close!
"test_compare_cpu_atan2_xpu_bfloat16",
# CUDA does not support the data type either
"test_compare_cpu_native_dropout_backward_xpu_bool",
"test_compare_cpu_native_dropout_backward_xpu_int16",
"test_compare_cpu_native_dropout_backward_xpu_int32",
"test_compare_cpu_native_dropout_backward_xpu_int64",
"test_compare_cpu_native_dropout_backward_xpu_int8",
"test_compare_cpu_native_dropout_backward_xpu_uint8",
"test_non_standard_bool_values_native_dropout_backward_xpu_bool",
# Need FP64 golden ref for more accurate comparison
"test_compare_cpu_log_softmax_xpu_bfloat16",
@@ -79,8 +70,6 @@
# https://en.cppreference.com/w/cpp/numeric/complex/exp
"test_compare_cpu_sigmoid_xpu_complex64",
"test_compare_cpu_sigmoid_xpu_complex128",
# Align with CUDA dtypes - RuntimeError: "avg_pool2d_out_xpu" not implemented for 'Long'
"test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",
# Special handle (different calculation order) in CPU reference impl.
# https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300
"test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16",
@@ -93,11 +82,6 @@
"test_view_replay_nn_functional_embedding_bag_xpu_float32",
# Double and complex datatype matmul is not supported in oneDNN
"test_compare_cpu_cdist_xpu_float64",
# CPU reference fail. `abs_cpu` does not support bool.
# The case should be skipped by PyTorch test infrastructure, but not be
# skipped correctly after https://github.com/pytorch/pytorch/pull/124147
# https://github.com/intel/torch-xpu-ops/issues/412
"test_compare_cpu_abs_xpu_bool",
# bilinear interpolate includes large calculation steps, accuracy reduces in half-precision
# Not in CUDA test scope too
"test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16",
@@ -146,28 +130,11 @@
# RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
# Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
"test_compare_cpu_polar_xpu_bfloat16",
# Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
# https://github.com/intel/torch-xpu-ops/issues/549
# Example fail log
# FAILED test_ops_xpu.py::TestCommonXPU::test_compare_cpu_nn_functional_batch_norm_xpu_float16 - AssertionError: Tensor-likes are not close!
# Mismatched elements: 3 / 72 (4.2%)
# Greatest absolute difference: 0.0029296875 at index (0, 1, 1, 0) (up to 0.001 allowed)
# Greatest relative difference: 0.0032501220703125 at index (2, 1, 2, 1) (up to 0.001 allowed)
"test_compare_cpu_nn_functional_batch_norm_xpu_float16",
"test_compare_cpu_std_mean_xpu_bfloat16",
"test_compare_cpu_sub_xpu_float16",
"test_compare_cpu_var_mean_xpu_bfloat16",
# Precision error.
# Mismatched elements: 2 / 125 (1.6%)
# Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed)
# Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed)
"test_compare_cpu_cumprod_xpu_bfloat16",
# Precision error.
# Mismatched elements: 1 / 9 (11.1%)
# Greatest absolute difference: 0.001953125 at index (2, 2) (up to 0.001 allowed)
# Greatest relative difference: 0.004669189453125 at index (2, 2) (up to 0.001 allowed)
# Not in CUDA test scope too
"test_compare_cpu_prod_xpu_bfloat16 ",
# Mismatched elements: 1 / 25 (4.0%)
# Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed)
# Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed)
"test_compare_cpu_sub_xpu_float16",
# different results for value index due to unstable sort.
# XPU and CUDA have the same result.
"test_compare_cpu_median_xpu_int16",
18 changes: 14 additions & 4 deletions test/xpu/extended/test_ops_xpu.py
@@ -63,10 +63,20 @@ class Namespace:
# Therefore, we build TestCommonProxy by inheriting the TestCommon and TestCase to ensure
# the same feature set as the TestCommon.
class TestCommonProxy(TestCase, TestCommonBase):
pass
def __init__(self, test_case = None):
if test_case:
# copy custom accuracy setting
self.maxDiff = test_case.maxDiff
self.precision = test_case.precision
self.rel_tol = test_case.rel_tol

class TestCompositeComplianceProxy(TestCase, TestCompositeComplianceBase):
pass
def __init__(self, test_case = None):
if test_case:
# copy custom accuracy setting
self.maxDiff = test_case.maxDiff
self.precision = test_case.precision
self.rel_tol = test_case.rel_tol


class TestCommon(TestCase):
@@ -78,13 +88,13 @@ class TestCommon(TestCase):
def test_compare_cpu(self, device, dtype, op):
# check if supported both by CPU and XPU
if dtype in op.dtypes and dtype in op.supported_dtypes(device):
self.proxy = Namespace.TestCommonProxy()
self.proxy = Namespace.TestCommonProxy(self)
test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
test_common_test_fn(self.proxy, device, dtype, op)
# for CUDA doesn't support operators
elif (op.name in _ops_without_cuda_support):
if dtype in op.dtypes:
self.proxy = Namespace.TestCommonProxy()
self.proxy = Namespace.TestCommonProxy(self)
test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
test_common_test_fn(self.proxy, device, dtype, op)
else:
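
For context on the proxy change above: `toleranceOverride` adjusts the `precision` and `rel_tol` attributes of the running test case, so the proxy copies those attributes from the wrapping `TestCommon` instance to let the relaxed tolerance reach the proxied `test_compare_cpu` call. A standalone sketch of just that hand-off, using illustrative stand-in classes rather than the real test classes:

    class OuterCase:
        # Attributes that a tolerance override adjusts on the active test case.
        maxDiff = None
        precision = 1e-3
        rel_tol = 1e-3

    class Proxy:
        def __init__(self, test_case=None):
            if test_case:
                # Copy the custom accuracy settings from the wrapping test case,
                # mirroring TestCommonProxy.__init__ in the hunk above.
                self.maxDiff = test_case.maxDiff
                self.precision = test_case.precision
                self.rel_tol = test_case.rel_tol

    proxy = Proxy(OuterCase())
    assert (proxy.precision, proxy.rel_tol) == (1e-3, 1e-3)
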
75 changes: 2 additions & 73 deletions test/xpu/skip_list_common.py
@@ -697,11 +697,6 @@
"test_conj_view_to_sparse_xpu_complex64",
"test_neg_conj_view_to_sparse_xpu_complex128",
"test_neg_view_to_sparse_xpu_float64",
# # CPU fallback error:AssertionError: The supported dtypes for nn.functional.pad on device type xpu are incorrect!
# The following dtypes did not work in forward but are listed by the OpInfo: {torch.float16}.
# The following dtypes did not work in backward but are listed by the OpInfo: {torch.float16}.
"test_dtypes_nn_functional_pad_replicate_negative_xpu",
"test_dtypes_nn_functional_pad_replicate_xpu",
# Op impl aligns with CUDA on the supported dtypes.
# RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'.
# Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed
@@ -711,7 +706,6 @@
#AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect!
#The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}.
#XPU supports bfloat16, CUDA doesn't support it.
"test_dtypes_unique_consecutive_xpu",
"test_dtypes_unique_xpu",
# RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
# Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
@@ -1032,10 +1026,8 @@
"test_save_load_nn_TransformerEncoder_train_mode_xpu_float64",
"test_save_load_nn_Transformer_xpu_float64",
# Unexpected success:
"test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
"test_cpu_gpu_parity_nn_ConvTranspose1d_xpu_complex32",
"test_memory_format_nn_AvgPool2d_xpu_float32",
"test_memory_format_nn_AvgPool2d_xpu_float64",
"test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
# CPU fallback could not cover these
# CUDA xfails
# Failed: Unexpected success
@@ -1104,31 +1096,12 @@
"test_to_nn_TransformerEncoder_eval_mode_swap_True_set_grad_True_xpu_float32",
"test_to_nn_TransformerEncoder_train_mode_swap_True_set_grad_True_xpu_float32",
"test_to_nn_Transformer_swap_True_set_grad_True_xpu_float32",
# issue 746, adjust tolerance
"test_non_contiguous_tensors_nn_Conv3d_xpu_float32",
),

"test_nn_xpu.py": (
# AttributeError: module 'torch.xpu' has no attribute 'FloatTensor'
"test_type",
# AssertionError: Tensor-likes are not close!
"test_Conv2d_dilated_with_long_tensor_cuda",
"test_Conv2d_groups_thnn_with_long_tensor_cuda",
"test_Conv2d_groups_with_long_tensor_cuda",
"test_Conv2d_no_bias_with_long_tensor_cuda",
"test_Conv2d_padding_with_long_tensor_cuda",
"test_Conv2d_strided_with_long_tensor_cuda",
"test_Conv2d_with_long_tensor_cuda",
"test_Conv3d_1x1x1_no_bias_with_long_tensor_cuda",
"test_Conv3d_groups_with_long_tensor_cuda",
"test_Conv3d_no_bias_with_long_tensor_cuda",
"test_Conv3d_stride_padding_with_long_tensor_cuda",
"test_Conv3d_stride_with_long_tensor_cuda",
"test_Conv3d_with_long_tensor_cuda",
"test_ConvTranspose2d_dilated_with_long_tensor_cuda",
"test_ConvTranspose2d_groups_with_long_tensor_cuda",
"test_ConvTranspose2d_no_bias_with_long_tensor_cuda",
"test_ConvTranspose2d_with_long_tensor_cuda",
"test_RReLU_cuda",
"test_RReLU_no_batch_dim_cuda",
"test_RReLU_with_up_down_cuda",
@@ -1176,10 +1149,6 @@
# AssertionError: False is not true
"test_ctc_loss_cudnn_xpu", # want "xpu" in function name
"test_ctc_loss_cudnn_tensor", # want "xpu" in function name
# RuntimeError: "smooth_l1_backward_cpu_out" not implemented for 'Half'
"test_SmoothL1Loss_no_batch_dim_mean_cuda_half",
"test_SmoothL1Loss_no_batch_dim_none_cuda_half",
"test_SmoothL1Loss_no_batch_dim_sum_cuda_half",
# RuntimeError: "multilabel_margin_loss_forward_out_frame" not implemented for 'Half'
"test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half",
"test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half",
@@ -1230,8 +1199,6 @@
# CPU only (vs Numpy). CUDA skips these cases since non-deterministic results are outputed for inf and nan.
"test_float_to_int_conversion_finite_xpu_int8",
"test_float_to_int_conversion_finite_xpu_int16",
# sparse
"test_tensor_ctor_device_inference_xpu",
# Dispatch issue. It is a composite operator. But it is implemented by
# DispatchStub. XPU doesn't support DispatchStub.
"test_kaiser_window_xpu",
@@ -1316,11 +1283,6 @@
"test_reference_numerics_large_asinh_xpu_complex128",
"test_reference_numerics_large_asinh_xpu_complex64",
"test_reference_numerics_large_asinh_xpu_complex32",
# Mismatched elements: 1 / 943593 (0.0%)
# Greatest absolute difference: 1.3363442121772096e-05 at index (742, 249) (up to 1e-05 allowed)
# Greatest relative difference: 8.852276550896931e-06 at index (742, 249) (up to 1.3e-06 allowed)
"test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64",

# AssertionError: Tensor-likes are not close!
# exceeded maximum allowed difference
# Greatest absolute difference: 6.266784475883469e-05 at index (463, 204) (up to 1e-05 allowed)
@@ -1468,8 +1430,6 @@
# https://github.com/intel/torch-xpu-ops/issues/275
# NotImplementedError: Could not run 'aten::empty_quantized' with arguments from the 'QuantizedXPU' backend.
"test_flip_xpu_float32",
# RuntimeError: "trace" not implemented for 'Half'
"test_trace_xpu_float16",
),

"test_content_store_xpu.py": None,
@@ -1729,7 +1689,6 @@
"test_fn_fwgrad_bwgrad_linalg_cholesky_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_cond_xpu_complex128",
"test_fn_fwgrad_bwgrad_linalg_cond_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_det_singular_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_det_xpu_complex128",
"test_fn_fwgrad_bwgrad_linalg_det_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_eig_xpu_complex128",
@@ -1765,7 +1724,6 @@
"test_fn_fwgrad_bwgrad_linalg_norm_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_complex128",
"test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_complex128",
"test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_float64",
"test_fn_fwgrad_bwgrad_linalg_pinv_xpu_complex128",
"test_fn_fwgrad_bwgrad_linalg_pinv_xpu_float64",
@@ -1957,16 +1915,7 @@
# torch.autograd.gradcheck.GradcheckError: While considering the real part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0,
"test_fn_fwgrad_bwgrad_linalg_norm_xpu_complex128",
# torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0,
"test_fn_fwgrad_bwgrad_linalg_vector_norm_xpu_complex128",
"test_fn_fwgrad_bwgrad_masked_normalize_xpu_complex128",
"test_fn_fwgrad_bwgrad_norm_inf_xpu_complex128",
"test_fn_fwgrad_bwgrad_renorm_xpu_complex128",
"test_forward_mode_AD_linalg_norm_xpu_complex128",
"test_forward_mode_AD_linalg_vector_norm_xpu_complex128",
"test_forward_mode_AD_masked_normalize_xpu_complex128",
"test_forward_mode_AD_norm_inf_xpu_complex128",
"test_forward_mode_AD_renorm_xpu_complex128",
"test_inplace_forward_mode_AD_renorm_xpu_complex128",
# RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive
"test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_complex128",
"test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_float64",
@@ -1976,9 +1925,6 @@
"test_forward_mode_AD_nn_functional_conv_transpose2d_xpu_float64",
"test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_complex128",
"test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_float64",
# RuntimeError: input tensor must have at least one element, but got input_sizes = [1, 0, 1]
"test_fn_fwgrad_bwgrad_nn_functional_group_norm_xpu_float64",
"test_forward_mode_AD_nn_functional_group_norm_xpu_float64",
# torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0,
"test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64",
"test_forward_mode_AD_nn_functional_rrelu_xpu_float64",
@@ -2010,11 +1956,6 @@
"test_scaled_mm_vs_emulated_float16_xpu",
"test_scaled_mm_vs_emulated_float32_xpu",
"test_scaled_mm_vs_emulated_row_wise_bfloat16_xpu",
# https://github.com/intel/torch-xpu-ops/issues/676
# Mismatched elements: 9 / 1003002 (0.0%)
# Greatest absolute difference: 711.126220703125 at index (472, 999) (up to 0.1 allowed)
# Greatest relative difference: 2.7107455730438232 at index (472, 997) (up to 0.1 allowed)
"test_cublas_addmm_size_1000_xpu_float32",
),

"test_maskedtensor_xpu.py": (
@@ -2110,6 +2051,7 @@
"test_reduction_all_sum_layout1_xpu_float16",
"test_reduction_all_sum_layout1_xpu_float64",
# RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta
"test_like_",
"test_invalid_sparse_layout_xpu",
"test_to_dense_and_sparse_csr_xpu",
"test_binary_core_add_layout2_xpu_float16",
@@ -2359,7 +2301,6 @@
"test_fn_gradgrad_linalg_cholesky_xpu_float64",
"test_fn_gradgrad_linalg_cond_xpu_complex128",
"test_fn_gradgrad_linalg_cond_xpu_float64",
"test_fn_gradgrad_linalg_det_singular_xpu_float64",
"test_fn_gradgrad_linalg_det_xpu_complex128",
"test_fn_gradgrad_linalg_det_xpu_float64",
"test_fn_gradgrad_linalg_eig_xpu_complex128",
@@ -2394,7 +2335,6 @@
"test_fn_gradgrad_linalg_multi_dot_xpu_float64",
"test_fn_gradgrad_linalg_pinv_hermitian_xpu_complex128",
"test_fn_gradgrad_linalg_pinv_hermitian_xpu_float64",
"test_fn_gradgrad_linalg_pinv_singular_xpu_complex128",
"test_fn_gradgrad_linalg_pinv_singular_xpu_float64",
"test_fn_gradgrad_linalg_pinv_xpu_complex128",
"test_fn_gradgrad_linalg_pinv_xpu_float64",
@@ -2486,14 +2426,6 @@
"test_fn_gradgrad_nn_functional_rrelu_xpu_float64",
"test_inplace_grad_nn_functional_rrelu_xpu_float64",
"test_inplace_gradgrad_nn_functional_rrelu_xpu_float64",
### Error #3 in TestBwdGradientsXPU , totally 8 , torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex outputs only, Jacobian mismatch for output 0 with respect to input 0,
"test_fn_grad_masked_normalize_xpu_complex128",
"test_fn_grad_renorm_xpu_complex128",
"test_fn_gradgrad_linalg_vector_norm_xpu_complex128",
"test_fn_gradgrad_masked_normalize_xpu_complex128",
"test_fn_gradgrad_renorm_xpu_complex128",
"test_inplace_grad_renorm_xpu_complex128",
"test_inplace_gradgrad_renorm_xpu_complex128",
### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive
"test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128",
"test_fn_grad_nn_functional_conv_transpose2d_xpu_float64",
@@ -2503,8 +2435,6 @@
"test_fn_gradgrad_nn_functional_conv_transpose2d_xpu_float64",
"test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_complex128",
"test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_float64",
### Error #6 in TestBwdGradientsXPU , totally 5 , torch.autograd.gradcheck.GradcheckError: Backward is not reentrant, i.e., running backward with same input and grad_output multiple times gives different values, although analytical gradient matches numerical gradient.The tolerance for nondeterminism was 0.0.
"test_fn_grad_nn_functional_max_pool2d_xpu_float64",
"test_fn_gradgrad_index_reduce_mean_xpu_float64",
"test_fn_gradgrad_index_reduce_prod_xpu_float64",
"test_inplace_gradgrad_index_reduce_mean_xpu_float64",
@@ -2672,7 +2602,6 @@
"test_multihead_attention_dtype_batch_first_xpu_float64",
"test_multihead_attention_dtype_xpu_float64",
"test_multihead_attn_fast_path_query_and_bias_have_different_dtypes_xpu_float64",
"test_multihead_attn_fast_path_small_test_xpu_float64",
"test_multihead_attn_in_proj_bias_none_xpu_float64",
"test_multihead_attn_in_proj_weight_none_xpu_float64",
),