diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
index c67be7dc5..54c2ce75f 100644
--- a/test/xpu/extended/run_test_with_skip.py
+++ b/test/xpu/extended/run_test_with_skip.py
@@ -50,16 +50,7 @@
     "test_compare_cpu_exp2_xpu_complex128",
     "test_compare_cpu_exp2_xpu_complex64",
     "test_compare_cpu_nextafter_xpu_bfloat16",
-    # skip random failure due to accuracy
-    # AssertionError: Tensor-likes are not close!
-    "test_compare_cpu_atan2_xpu_bfloat16", # CUDA does not support the data type either
-    "test_compare_cpu_native_dropout_backward_xpu_bool",
-    "test_compare_cpu_native_dropout_backward_xpu_int16",
-    "test_compare_cpu_native_dropout_backward_xpu_int32",
-    "test_compare_cpu_native_dropout_backward_xpu_int64",
-    "test_compare_cpu_native_dropout_backward_xpu_int8",
-    "test_compare_cpu_native_dropout_backward_xpu_uint8",
     "test_non_standard_bool_values_native_dropout_backward_xpu_bool",
     # Need FP64 golden ref for more accurate comparison
     "test_compare_cpu_log_softmax_xpu_bfloat16",
@@ -79,8 +70,6 @@
     # https://en.cppreference.com/w/cpp/numeric/complex/exp
     "test_compare_cpu_sigmoid_xpu_complex64",
     "test_compare_cpu_sigmoid_xpu_complex128",
-    # Align with CUDA dtypes - RuntimeError: "avg_pool2d_out_xpu" not implemented for 'Long'
-    "test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",
     # Special handle (different calculation order) in CPU reference impl.
     # https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300
     "test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16",
@@ -93,11 +82,6 @@
     "test_view_replay_nn_functional_embedding_bag_xpu_float32",
     # Double and complex datatype matmul is not supported in oneDNN
     "test_compare_cpu_cdist_xpu_float64",
-    # CPU reference fail. `abs_cpu` does not support bool.
-    # The case should be skipped by PyTorch test infrastructure, but not be
-    # skipped correctly after https://github.com/pytorch/pytorch/pull/124147
-    # https://github.com/intel/torch-xpu-ops/issues/412
-    "test_compare_cpu_abs_xpu_bool",
     # bilinear interpolate includes large calculation steps, accuracy reduces in half-precision
     # Not in CUDA test scope too
     "test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16",
@@ -146,28 +130,11 @@
     # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
     # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
     "test_compare_cpu_polar_xpu_bfloat16",
-    # Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
-    # https://github.com/intel/torch-xpu-ops/issues/549
-    # Example fail log
-    # FAILED test_ops_xpu.py::TestCommonXPU::test_compare_cpu_nn_functional_batch_norm_xpu_float16 - AssertionError: Tensor-likes are not close!
-    # Mismatched elements: 3 / 72 (4.2%)
-    # Greatest absolute difference: 0.0029296875 at index (0, 1, 1, 0) (up to 0.001 allowed)
-    # Greatest relative difference: 0.0032501220703125 at index (2, 1, 2, 1) (up to 0.001 allowed)
-    "test_compare_cpu_nn_functional_batch_norm_xpu_float16",
-    "test_compare_cpu_std_mean_xpu_bfloat16",
-    "test_compare_cpu_sub_xpu_float16",
-    "test_compare_cpu_var_mean_xpu_bfloat16",
-    # Precision error.
-    # Mismatched elements: 2 / 125 (1.6%)
-    # Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed)
-    # Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed)
-    "test_compare_cpu_cumprod_xpu_bfloat16",
     # Precision error.
-    # Mismatched elements: 1 / 9 (11.1%)
-    # Greatest absolute difference: 0.001953125 at index (2, 2) (up to 0.001 allowed)
-    # Greatest relative difference: 0.004669189453125 at index (2, 2) (up to 0.001 allowed)
-    # Not in CUDA test scope too
-    "test_compare_cpu_prod_xpu_bfloat16 ",
+    # Mismatched elements: 1 / 25 (4.0%)
+    # Greatest absolute difference: 0.00146484375 at index (0, 0) (up to 0.001 allowed)
+    # Greatest relative difference: 0.0163116455078125 at index (0, 0) (up to 0.001 allowed)
+    "test_compare_cpu_sub_xpu_float16",
     # different results for value index due to unstable sort.
     # XPU and CUDA have the same result.
     "test_compare_cpu_median_xpu_int16",
diff --git a/test/xpu/extended/test_ops_xpu.py b/test/xpu/extended/test_ops_xpu.py
index c32508d2c..e6d46fccb 100644
--- a/test/xpu/extended/test_ops_xpu.py
+++ b/test/xpu/extended/test_ops_xpu.py
@@ -63,10 +63,20 @@ class Namespace:
     # Therefore, we build TestCommonProxy by inheriting the TestCommon and TestCase to ensure
     # the same feature set as the TestCommon.
     class TestCommonProxy(TestCase, TestCommonBase):
-        pass
+        def __init__(self, test_case=None):
+            if test_case:
+                # copy custom accuracy settings from the owning test case
+                self.maxDiff = test_case.maxDiff
+                self.precision = test_case.precision
+                self.rel_tol = test_case.rel_tol
 
     class TestCompositeComplianceProxy(TestCase, TestCompositeComplianceBase):
-        pass
+        def __init__(self, test_case=None):
+            if test_case:
+                # copy custom accuracy settings from the owning test case
+                self.maxDiff = test_case.maxDiff
+                self.precision = test_case.precision
+                self.rel_tol = test_case.rel_tol
 
 
 class TestCommon(TestCase):
@@ -78,13 +88,13 @@ class TestCommon(TestCase):
     def test_compare_cpu(self, device, dtype, op):
         # check if supported both by CPU and XPU
         if dtype in op.dtypes and dtype in op.supported_dtypes(device):
-            self.proxy = Namespace.TestCommonProxy()
+            self.proxy = Namespace.TestCommonProxy(self)
             test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
             test_common_test_fn(self.proxy, device, dtype, op)
         # for CUDA doesn't support operators
         elif (op.name in _ops_without_cuda_support):
             if dtype in op.dtypes:
-                self.proxy = Namespace.TestCommonProxy()
+                self.proxy = Namespace.TestCommonProxy(self)
                 test_common_test_fn = get_wrapped_fn(Namespace.TestCommonProxy.test_compare_cpu)
                 test_common_test_fn(self.proxy, device, dtype, op)
         else:
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index cf9bd687d..781b6693b 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -697,11 +697,6 @@
         "test_conj_view_to_sparse_xpu_complex64",
         "test_neg_conj_view_to_sparse_xpu_complex128",
         "test_neg_view_to_sparse_xpu_float64",
-        # # CPU fallback error:AssertionError: The supported dtypes for nn.functional.pad on device type xpu are incorrect!
-        # The following dtypes did not work in forward but are listed by the OpInfo: {torch.float16}.
-        # The following dtypes did not work in backward but are listed by the OpInfo: {torch.float16}.
-        "test_dtypes_nn_functional_pad_replicate_negative_xpu",
-        "test_dtypes_nn_functional_pad_replicate_xpu",
         # Op impl aligns with CUDA on the supported dtypes.
         # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'.
         # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed
@@ -711,7 +706,6 @@
         #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect!
         #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}.
         #XPU supports bfloat16, CUDA doesn't support it.
-        "test_dtypes_unique_consecutive_xpu",
         "test_dtypes_unique_xpu",
         # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
         # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error.
@@ -1032,10 +1026,8 @@
         "test_save_load_nn_TransformerEncoder_train_mode_xpu_float64",
         "test_save_load_nn_Transformer_xpu_float64",
         # Unexpected success:
-        "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
         "test_cpu_gpu_parity_nn_ConvTranspose1d_xpu_complex32",
-        "test_memory_format_nn_AvgPool2d_xpu_float32",
-        "test_memory_format_nn_AvgPool2d_xpu_float64",
+        "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32",
         # CPU fallback could not cover these
         # CUDA xfails
         # Failed: Unexpected success
@@ -1104,31 +1096,12 @@
         "test_to_nn_TransformerEncoder_eval_mode_swap_True_set_grad_True_xpu_float32",
         "test_to_nn_TransformerEncoder_train_mode_swap_True_set_grad_True_xpu_float32",
         "test_to_nn_Transformer_swap_True_set_grad_True_xpu_float32",
-        #issue 746, adjust tolerence
-        "test_non_contiguous_tensors_nn_Conv3d_xpu_float32",
     ),
 
     "test_nn_xpu.py": (
         # AttributeError: module 'torch.xpu' has no attribute 'FloatTensor'
         "test_type",
         # AssertionError: Tensor-likes are not close!
-        "test_Conv2d_dilated_with_long_tensor_cuda",
-        "test_Conv2d_groups_thnn_with_long_tensor_cuda",
-        "test_Conv2d_groups_with_long_tensor_cuda",
-        "test_Conv2d_no_bias_with_long_tensor_cuda",
-        "test_Conv2d_padding_with_long_tensor_cuda",
-        "test_Conv2d_strided_with_long_tensor_cuda",
-        "test_Conv2d_with_long_tensor_cuda",
-        "test_Conv3d_1x1x1_no_bias_with_long_tensor_cuda",
-        "test_Conv3d_groups_with_long_tensor_cuda",
-        "test_Conv3d_no_bias_with_long_tensor_cuda",
-        "test_Conv3d_stride_padding_with_long_tensor_cuda",
-        "test_Conv3d_stride_with_long_tensor_cuda",
-        "test_Conv3d_with_long_tensor_cuda",
-        "test_ConvTranspose2d_dilated_with_long_tensor_cuda",
-        "test_ConvTranspose2d_groups_with_long_tensor_cuda",
-        "test_ConvTranspose2d_no_bias_with_long_tensor_cuda",
-        "test_ConvTranspose2d_with_long_tensor_cuda",
         "test_RReLU_cuda",
         "test_RReLU_no_batch_dim_cuda",
         "test_RReLU_with_up_down_cuda",
@@ -1176,10 +1149,6 @@
         # AssertionError: False is not true
         "test_ctc_loss_cudnn_xpu", # want "xpu" in function name
         "test_ctc_loss_cudnn_tensor", # want "xpu" in function name
-        # RuntimeError: "smooth_l1_backward_cpu_out" not implemented for 'Half'
-        "test_SmoothL1Loss_no_batch_dim_mean_cuda_half",
-        "test_SmoothL1Loss_no_batch_dim_none_cuda_half",
-        "test_SmoothL1Loss_no_batch_dim_sum_cuda_half",
         # RuntimeError: "multilabel_margin_loss_forward_out_frame" not implemented for 'Half'
         "test_MultiLabelMarginLoss_no_batch_dim_mean_cuda_half",
         "test_MultiLabelMarginLoss_no_batch_dim_none_cuda_half",
@@ -1230,8 +1199,6 @@
         # CPU only (vs Numpy). CUDA skips these cases since non-deterministic results are outputed for inf and nan.
         "test_float_to_int_conversion_finite_xpu_int8",
         "test_float_to_int_conversion_finite_xpu_int16",
-        # sparse
-        "test_tensor_ctor_device_inference_xpu",
         # Dispatch issue. It is a composite operator. But it is implemented by
         # DispatchStub. XPU doesn't support DispatchStub.
"test_kaiser_window_xpu", @@ -1316,11 +1283,6 @@ "test_reference_numerics_large_asinh_xpu_complex128", "test_reference_numerics_large_asinh_xpu_complex64", "test_reference_numerics_large_asinh_xpu_complex32", - # Mismatched elements: 1 / 943593 (0.0%) - # Greatest absolute difference: 1.3363442121772096e-05 at index (742, 249) (up to 1e-05 allowed) - # Greatest relative difference: 8.852276550896931e-06 at index (742, 249) (up to 1.3e-06 allowed) - "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", - # AssertionError: Tensor-likes are not close! # exceeded maximum allowed difference # Greatest absolute difference: 6.266784475883469e-05 at index (463, 204) (up to 1e-05 allowed) @@ -1468,8 +1430,6 @@ # https://github.com/intel/torch-xpu-ops/issues/275 # NotImplementedError: Could not run 'aten::empty_quantized' with arguments from the 'QuantizedXPU' backend. "test_flip_xpu_float32", - # RuntimeError: "trace" not implemented for 'Half' - "test_trace_xpu_float16", ), "test_content_store_xpu.py": None, @@ -1729,7 +1689,6 @@ "test_fn_fwgrad_bwgrad_linalg_cholesky_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_cond_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_cond_xpu_float64", - "test_fn_fwgrad_bwgrad_linalg_det_singular_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_det_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_det_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_eig_xpu_complex128", @@ -1765,7 +1724,6 @@ "test_fn_fwgrad_bwgrad_linalg_norm_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_hermitian_xpu_float64", - "test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_singular_xpu_float64", "test_fn_fwgrad_bwgrad_linalg_pinv_xpu_complex128", "test_fn_fwgrad_bwgrad_linalg_pinv_xpu_float64", @@ -1957,16 +1915,7 @@ # torch.autograd.gradcheck.GradcheckError: While considering the real part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_linalg_norm_xpu_complex128", # torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex inputs only, Jacobian computed with forward mode mismatch for output 0 with respect to input 0, - "test_fn_fwgrad_bwgrad_linalg_vector_norm_xpu_complex128", - "test_fn_fwgrad_bwgrad_masked_normalize_xpu_complex128", - "test_fn_fwgrad_bwgrad_norm_inf_xpu_complex128", - "test_fn_fwgrad_bwgrad_renorm_xpu_complex128", "test_forward_mode_AD_linalg_norm_xpu_complex128", - "test_forward_mode_AD_linalg_vector_norm_xpu_complex128", - "test_forward_mode_AD_masked_normalize_xpu_complex128", - "test_forward_mode_AD_norm_inf_xpu_complex128", - "test_forward_mode_AD_renorm_xpu_complex128", - "test_inplace_forward_mode_AD_renorm_xpu_complex128", # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_fwgrad_bwgrad_nn_functional_conv_transpose2d_xpu_float64", @@ -1976,9 +1925,6 @@ "test_forward_mode_AD_nn_functional_conv_transpose2d_xpu_float64", "test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_complex128", "test_forward_mode_AD_nn_functional_conv_transpose3d_xpu_float64", - # RuntimeError: input tensor must have at least one element, but got input_sizes = [1, 0, 1] - "test_fn_fwgrad_bwgrad_nn_functional_group_norm_xpu_float64", - "test_forward_mode_AD_nn_functional_group_norm_xpu_float64", # 
torch.autograd.gradcheck.GradcheckError: Jacobian computed with forward mode mismatch for output 0 with respect to input 0, "test_fn_fwgrad_bwgrad_nn_functional_rrelu_xpu_float64", "test_forward_mode_AD_nn_functional_rrelu_xpu_float64", @@ -2010,11 +1956,6 @@ "test_scaled_mm_vs_emulated_float16_xpu", "test_scaled_mm_vs_emulated_float32_xpu", "test_scaled_mm_vs_emulated_row_wise_bfloat16_xpu", - # https://github.com/intel/torch-xpu-ops/issues/676 - # Mismatched elements: 9 / 1003002 (0.0%) - # Greatest absolute difference: 711.126220703125 at index (472, 999) (up to 0.1 allowed) - # Greatest relative difference: 2.7107455730438232 at index (472, 997) (up to 0.1 allowed) - "test_cublas_addmm_size_1000_xpu_float32", ), "test_maskedtensor_xpu.py": ( @@ -2110,6 +2051,7 @@ "test_reduction_all_sum_layout1_xpu_float16", "test_reduction_all_sum_layout1_xpu_float64", # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta + "test_like_", "test_invalid_sparse_layout_xpu", "test_to_dense_and_sparse_csr_xpu", "test_binary_core_add_layout2_xpu_float16", @@ -2359,7 +2301,6 @@ "test_fn_gradgrad_linalg_cholesky_xpu_float64", "test_fn_gradgrad_linalg_cond_xpu_complex128", "test_fn_gradgrad_linalg_cond_xpu_float64", - "test_fn_gradgrad_linalg_det_singular_xpu_float64", "test_fn_gradgrad_linalg_det_xpu_complex128", "test_fn_gradgrad_linalg_det_xpu_float64", "test_fn_gradgrad_linalg_eig_xpu_complex128", @@ -2394,7 +2335,6 @@ "test_fn_gradgrad_linalg_multi_dot_xpu_float64", "test_fn_gradgrad_linalg_pinv_hermitian_xpu_complex128", "test_fn_gradgrad_linalg_pinv_hermitian_xpu_float64", - "test_fn_gradgrad_linalg_pinv_singular_xpu_complex128", "test_fn_gradgrad_linalg_pinv_singular_xpu_float64", "test_fn_gradgrad_linalg_pinv_xpu_complex128", "test_fn_gradgrad_linalg_pinv_xpu_float64", @@ -2486,14 +2426,6 @@ "test_fn_gradgrad_nn_functional_rrelu_xpu_float64", "test_inplace_grad_nn_functional_rrelu_xpu_float64", "test_inplace_gradgrad_nn_functional_rrelu_xpu_float64", - ### Error #3 in TestBwdGradientsXPU , totally 8 , torch.autograd.gradcheck.GradcheckError: While considering the imaginary part of complex outputs only, Jacobian mismatch for output 0 with respect to input 0, - "test_fn_grad_masked_normalize_xpu_complex128", - "test_fn_grad_renorm_xpu_complex128", - "test_fn_gradgrad_linalg_vector_norm_xpu_complex128", - "test_fn_gradgrad_masked_normalize_xpu_complex128", - "test_fn_gradgrad_renorm_xpu_complex128", - "test_inplace_grad_renorm_xpu_complex128", - "test_inplace_gradgrad_renorm_xpu_complex128", ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", @@ -2503,8 +2435,6 @@ "test_fn_gradgrad_nn_functional_conv_transpose2d_xpu_float64", "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_complex128", "test_fn_gradgrad_nn_functional_conv_transpose3d_xpu_float64", - ### Error #6 in TestBwdGradientsXPU , totally 5 , torch.autograd.gradcheck.GradcheckError: Backward is not reentrant, i.e., running backward with same input and grad_output multiple times gives different values, although analytical gradient matches numerical gradient.The tolerance for nondeterminism was 0.0. 
- "test_fn_grad_nn_functional_max_pool2d_xpu_float64", "test_fn_gradgrad_index_reduce_mean_xpu_float64", "test_fn_gradgrad_index_reduce_prod_xpu_float64", "test_inplace_gradgrad_index_reduce_mean_xpu_float64", @@ -2672,7 +2602,6 @@ "test_multihead_attention_dtype_batch_first_xpu_float64", "test_multihead_attention_dtype_xpu_float64", "test_multihead_attn_fast_path_query_and_bias_have_different_dtypes_xpu_float64", - "test_multihead_attn_fast_path_small_test_xpu_float64", "test_multihead_attn_in_proj_bias_none_xpu_float64", "test_multihead_attn_in_proj_weight_none_xpu_float64", ), diff --git a/test/xpu/test_decomp_xpu.py b/test/xpu/test_decomp_xpu.py index 33c997ec7..d659197d9 100644 --- a/test/xpu/test_decomp_xpu.py +++ b/test/xpu/test_decomp_xpu.py @@ -1,5 +1,6 @@ # Owner(s): ["module: intel"] +import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_utils import run_tests @@ -9,8 +10,72 @@ from .xpu_test_utils import XPUPatchForImport with XPUPatchForImport(False): + import test_decomp from test_decomp import TestDecomp,DecompOneOffTests +def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs): + assert orig.dtype == decomp.dtype, f"{i} Operation: {op}" + if orig.numel() == 0 or decomp.numel() == 0: + assert orig.numel() == decomp.numel() + return + assert orig.shape == decomp.shape, f"{i} Operation: {op}" + tol_table = { + (torch.bfloat16, torch.ops.aten.native_layer_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_layer_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_layer_norm_backward.default): 1e-3, + (torch.bfloat16, torch.ops.aten.native_layer_norm_backward.default): 2e-2, + (torch.bfloat16, torch.ops.aten.native_batch_norm.default): 1e-5, + (torch.float16, torch.ops.aten.native_batch_norm.default): 1e-5, + (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.default): 1e-5, + (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5, + (torch.float16, torch.ops.aten._native_batch_norm_legit.default): 1e-5, + (torch.float16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5, + (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-4, + (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-4, + (torch.bfloat16, torch.ops.aten.var_mean.correction): 5e-7, + (torch.float16, torch.ops.aten.var_mean.correction): 5e-7, + (torch.bfloat16, torch.ops.aten.var_mean.dim): 5e-7, + (torch.float16, torch.ops.aten.var_mean.dim): 5e-7, + (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2, + (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1, + (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2, + (torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1, + (torch.float16, torch.ops.aten.hardswish.default): 2e-7, + (torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7, + (torch.float16, torch.ops.aten.multi_margin_loss.default): 3e-2, + (torch.bfloat16, torch.ops.aten.multi_margin_loss.default): 5e-2, + (torch.float16, torch.ops.aten.multilabel_margin_loss_forward.default): 3e-2, + (torch.bfloat16, torch.ops.aten.multilabel_margin_loss_forward.default): 3e-2, + (torch.float16, torch.ops.aten.reflection_pad1d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad1d_backward.default): 5e-3, + (torch.float16, torch.ops.aten.reflection_pad2d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad2d_backward.default): 5e-3, + (torch.float16, 
torch.ops.aten.reflection_pad3d_backward.default): 5e-3, + (torch.bfloat16, torch.ops.aten.reflection_pad3d_backward.default): 5e-2, + # see https://github.com/pytorch/pytorch/pull/96264 + (torch.float16, torch.ops.aten.mv.default): 1e-5, + (torch.bfloat16, torch.ops.aten.mv.default): 1e-5, + (torch.float16, torch.ops.aten.log_sigmoid_backward.default): 2e-5, + (torch.float16, torch.ops.aten._batch_norm_with_update.default): 2e-7, # adjust tolerance for xpu, so hook this func + (torch.bfloat16, torch.ops.aten._batch_norm_with_update.default): 2e-7, # adjust tolerance for xpu, so hook this func + } + if ref.is_floating_point(): + orig_diff = (orig - ref).abs().max() + decomp_diff = (decomp - ref).abs().max() + atol = tol_table.get((test_dtype, op), 1e-7) + if decomp_diff > orig_diff + atol: + raise RuntimeError( + f"Difference from float64 is larger with decomposition {op.__name__}" + f" than original on output {i}. Original max diff: {orig_diff}, Decomp max diff: {decomp_diff}\n" + f"atol = {atol}\n" + f"args = {args}\n" + f"kwargs = {kwargs}" + ) + else: + test_case.assertEqual( + orig, decomp, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}" + ) +test_decomp.op_assert_ref=_op_assert_ref instantiate_device_type_tests(TestDecomp, globals(), only_for="xpu", allow_xpu=True) instantiate_device_type_tests(DecompOneOffTests, globals(), only_for="xpu", allow_xpu=True) diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py index 80fb3c8b0..1a4a57a41 100644 --- a/test/xpu/test_torch_xpu.py +++ b/test/xpu/test_torch_xpu.py @@ -6974,11 +6974,7 @@ def helper(dim, dtype, device, size_result, size_source): ref_out = tensor.index_add(dim, index, source, alpha=2.) / 2. ref_out = ref_out.to(dtype=dtype) out = tensor.index_add(dim, index, source) - if device == 'cuda' or device == 'xpu': - self.assertEqual(out, ref_out, atol=1e-2, rtol=1e-2) - else: - # scatter_add uses fp32 as accumulate type, while index_add doesn't. - self.assertEqual(out, ref_out.to(dtype=dtype), atol=1e-2, rtol=1e-2) + self.assertEqual(out, ref_out, atol=7e-2, rtol=1.2e-2) for dim in [-1, -2, -3]: for dtype in all_types_and_complex_and(torch.half, torch.bfloat16): diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 1c7dbcdff..c6c239cca 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -14,6 +14,7 @@ common_methods_invocations, common_utils, ) +from torch.testing._internal.common_device_type import tol, toleranceOverride from torch.testing._internal.common_modules import module_db from torch.testing._internal.common_nn import CriterionTest, ModuleTest from torch.testing._internal.common_utils import set_default_dtype @@ -255,11 +256,85 @@ ("logcumsumexp", "test_out_warning"), ("_refs.mul", "test_python_ref"), ("_refs.mul", "test_python_ref_torch_fallback"), + ("nn.AvgPool2d", "test_memory_format"), ("narrow_copy","test_meta_outplace"), ("narrow_copy","test_dispatch_meta_outplace"), ("narrow_copy","test_dispatch_symbolic_meta_outplace"), ] +# some case should adjust tolerance to pass. +# The new threshold is at the same order of magnitude as cuda's or cpu's. 
+# Format: {op_name: {(cls_name, test_name): {dtype: tol(atol, rtol)}}}
+
+_xpu_tolerance_override = {
+    "nn.functional.tanhshrink": {
+        ("TestUnaryUfuncs", "test_reference_numerics_normal"): {
+            torch.complex64: tol(atol=2e-05, rtol=9e-06),
+            torch.bfloat16: tol(atol=1e-02, rtol=1.6e-02),
+        }
+    },
+    "atan2": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "cumprod": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.008),
+        }
+    },
+    "nanmean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.008),
+        }
+    },
+    "nansum": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.006),
+        }
+    },
+    "nn.functional.batch_norm": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.003, rtol=0.004),
+        }
+    },
+    "nn.functional.embedding_bag": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.005, rtol=0.007),
+        }
+    },
+    "nn.functional.group_norm": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.float16: tol(atol=0.002, rtol=0.006),
+        }
+    },
+    "prod": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.002, rtol=0.005),
+        }
+    },
+    "rsqrt": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.004, rtol=0.007),
+        }
+    },
+    "std_mean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "var_mean": {
+        ("TestCommon", "test_compare_cpu"): {
+            torch.bfloat16: tol(atol=0.008, rtol=0.005),
+        }
+    },
+    "nn.LazyConvTranspose3d": {
+        ("TestModule", "test_non_contiguous_tensors"): {
+            torch.float32: tol(atol=2e-5, rtol=5e-5),
+        }
+    },
+}
+
 
 def get_wrapped_fn(fn):
     if hasattr(fn, "__wrapped__"):
@@ -296,6 +371,8 @@ def to_xpu(obj, type_map=None):
 
 
 def ModuleTest_test_xpu(self, test_case):
+    if not self.should_test_cuda:
+        raise unittest.SkipTest("Excluded from XPU tests")
     with set_default_dtype(self.default_dtype):
         cpu_input = self._get_input()
 
@@ -320,13 +397,6 @@
         test_case._zero_grad_parameters(xpu_module)
         cpu_output = test_case._forward(cpu_module, cpu_input_tuple)
         xpu_output = test_case._forward(xpu_module, xpu_input_tuple)
-        test_case.assertEqual(
-            cpu_input_tuple,
-            xpu_input_tuple,
-            atol=self.precision,
-            rtol=0,
-            exact_dtype=False,
-        )
         if getattr(cpu_module, "return_indices", False):
             cpu_output = cpu_output[0]
             xpu_output = xpu_output[0]
@@ -334,18 +404,11 @@
             cpu_output, xpu_output, atol=self.precision, rtol=0, exact_dtype=False
         )
 
-        # Run backwards on CPU and GPU and compare results
+        # Run backwards on CPU and XPU and compare results
         for _ in range(5):
            cpu_gradOutput = cpu_output.clone().normal_()
            xpu_gradOutput = cpu_gradOutput.type_as(xpu_output)
-            test_case.assertEqual(
-                cpu_input_tuple,
-                xpu_input_tuple,
-                atol=self.precision,
-                rtol=0,
-                exact_dtype=False,
-            )
             cpu_gradInput = test_case._backward(
                 cpu_module, cpu_input_tuple, cpu_output, cpu_gradOutput
             )
@@ -361,7 +424,7 @@
             )
             for cpu_d_p, xpu_d_p in zip(cpu_param[1], xpu_param[1]):
                 test_case.assertEqual(cpu_d_p, xpu_d_p, atol=self.precision, rtol=0)
 
-        # Run double-backwards on CPU and GPU and compare results
+        # Run double-backwards on CPU and XPU and compare results
         if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
             cpu_output = cpu_module(*cpu_input_tuple)
@@ -441,6 +504,8 @@ def convert_dtype(obj, dtype, requires_grad=False):
         else:
             return obj
 
+    if not self.should_test_cuda:
+        raise unittest.SkipTest("Excluded from XPU tests")
     with set_default_dtype(self.default_dtype):
         cpu_input = self._get_input()
         cpu_target = self._get_target()
@@ -539,7 +604,7 @@ def __init__(self, patch_test_case=True) -> None:
         self.cuda_is_bf16_supported = cuda.is_bf16_supported
 
     def align_db_decorators(self, db):
-        def gen_xpu_wrappers(name, wrappers):
+        def gen_xpu_wrappers(op_name, wrappers):
             wrapper_xpu = []
             replaced = False
             for wrapper in wrappers:
@@ -547,7 +612,7 @@
                 if wrapper.device_type == "cuda":
                     if (
                         unittest.expectedFailure in wrapper.decorators
-                        and (name, wrapper.test_name) in _cuda_xfail_xpu_pass
+                        and (op_name, wrapper.test_name) in _cuda_xfail_xpu_pass
                     ):
                         pass
                     else:
@@ -562,6 +627,19 @@
         for info in db:
             if hasattr(info, "decorators"):
                 replaced, decorator_xpu = gen_xpu_wrappers(info.name, info.decorators)
+
+                # The later decorator overrides the earlier one, so append the tolerance override last.
+                if info.name in _xpu_tolerance_override:
+                    replaced = True
+                    for case, tolerance in _xpu_tolerance_override[info.name].items():
+                        decorator_xpu.append(
+                            DecorateInfo(
+                                toleranceOverride(tolerance),
+                                case[0],  # cls_name
+                                case[1],  # test_name
+                                device_type="xpu",
+                            )
+                        )
                 if replaced:
                     info.decorators = tuple(decorator_xpu)
             if hasattr(info, "skips"):
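
Note on the tolerance-override mechanism this patch adds to test/xpu/xpu_test_utils.py: each `_xpu_tolerance_override` entry is expanded into a `DecorateInfo` wrapping `toleranceOverride`, scoped to the "xpu" device type, and appended to the matching OpInfo's decorators, replacing the per-test skip entries removed above. The following is a minimal, self-contained sketch of that expansion; the `override` table and the `decorators` list below are illustrative names, not part of the patch.

    import torch
    from torch.testing._internal.common_device_type import tol, toleranceOverride
    from torch.testing._internal.common_methods_invocations import DecorateInfo

    # Illustrative single-op table in the same shape as _xpu_tolerance_override:
    # {(cls_name, test_name): {dtype: tol(atol, rtol)}}
    override = {
        ("TestCommon", "test_compare_cpu"): {
            torch.bfloat16: tol(atol=2e-3, rtol=5e-3),
        },
    }

    decorators = []
    for (cls_name, test_name), dtype_tols in override.items():
        decorators.append(
            DecorateInfo(
                toleranceOverride(dtype_tols),  # relaxes atol/rtol for the listed dtypes
                cls_name,                       # e.g. "TestCommon"
                test_name,                      # e.g. "test_compare_cpu"
                device_type="xpu",              # applied only when running on XPU
            )
        )

Because decorators appended later take precedence, adding these overrides after the CUDA-derived wrappers lets the relaxed XPU tolerances take effect for the listed dtypes while leaving all other dtypes and devices untouched.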