
Commit

Merge branch 'main' into penghuic/Fixed_regression_issue
fengyuan14 authored Aug 12, 2024
2 parents 91c7a70 + a1657ad commit 309e208
Showing 4 changed files with 69 additions and 117 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nightly_ondemand.yml
@@ -191,7 +191,7 @@ jobs:
with:
suite: huggingface
env_prepare: true
- dt: float32,bfloat16,float16
+ dt: float32,bfloat16,float16,amp_bf16,amp_fp16
mode: inference,training
scenario: accuracy
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
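The added amp_bf16 and amp_fp16 values extend the nightly HuggingFace accuracy matrix from plain low-precision runs to autocast (AMP) runs. As a rough illustration of the difference, not code from this repository, the sketch below contrasts a pure-bfloat16 forward pass with an autocast one; it assumes a PyTorch build with XPU support, and the model and input are hypothetical placeholders.

import torch

# Hypothetical toy model and input; any XPU-capable module would do.
model = torch.nn.Linear(16, 16).to("xpu")
x = torch.randn(4, 16, device="xpu")

# dt=bfloat16: weights and activations are cast up front; every op runs in bfloat16.
with torch.no_grad():
    y_bf16 = model.to(torch.bfloat16)(x.to(torch.bfloat16))

# dt=amp_bf16: weights stay in float32; autocast picks per-op precision at run time.
# dt=amp_fp16 would use dtype=torch.float16 in the same way.
model = model.float()
with torch.no_grad(), torch.autocast(device_type="xpu", dtype=torch.bfloat16):
    y_amp = model(x.float())
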
28 changes: 1 addition & 27 deletions test/xpu/extended/run_test_with_skip.py
@@ -50,11 +50,9 @@
"test_compare_cpu_exp2_xpu_complex128",
"test_compare_cpu_exp2_xpu_complex64",
"test_compare_cpu_nextafter_xpu_bfloat16",

# skip random failure due to accuracy
# AssertionError: Tensor-likes are not close!
"test_compare_cpu_atan2_xpu_bfloat16",

# CUDA does not support the data type either
"test_compare_cpu_native_dropout_backward_xpu_bool",
"test_compare_cpu_native_dropout_backward_xpu_int16",
@@ -63,59 +61,47 @@
"test_compare_cpu_native_dropout_backward_xpu_int8",
"test_compare_cpu_native_dropout_backward_xpu_uint8",
"test_non_standard_bool_values_native_dropout_backward_xpu_bool",

# Need FP64 golden ref for more accurate comparison
"test_compare_cpu_log_softmax_xpu_bfloat16",

# TestCompositeCompliance
# CPU fallback fails
# Requires implementing aten::embedding_renorm_
"test_view_replay_nn_functional_embedding_xpu_float32",

# TestCompositeCompliance::test_cow_input
# XPU Tensor fails in copy-on-write cases
# AssertionError: False is not true : Keyword argument 'output grad 0' during backward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_backward=False` in this operation's OpInfo, add the arg to the OpInfo's `allow_cow_input_materialize_backward` list, or change the implementation to avoid materialization.
# https://github.com/intel/torch-xpu-ops/issues/281
"test_cow_input",


# XPU implementation is correct.
# std::exp{-inf, nan}, the result is (±0,±0) (signs are unspecified)
# std::exp{-inf, inf}, the result is (±0,±0) (signs are unspecified)
# The CPU implementation returns NaN in these cases.
# https://en.cppreference.com/w/cpp/numeric/complex/exp
"test_compare_cpu_sigmoid_xpu_complex64",
"test_compare_cpu_sigmoid_xpu_complex128",

# Align with CUDA dtypes - RuntimeError: "avg_pool2d_out_xpu" not implemented for 'Long'
"test_compare_cpu_nn_functional_avg_pool2d_xpu_int64",

# Special handling (different calculation order) in the CPU reference impl.
# https://github.com/pytorch/pytorch/blob/c97e3ebb96d7457075b019b94411e8c2d058e68b/aten/src/ATen/native/EmbeddingBag.cpp#L300
"test_compare_cpu_nn_functional_embedding_bag_xpu_bfloat16",
"test_compare_cpu_nn_functional_embedding_bag_xpu_float16",

# Operator not implemented: aten::embedding_renorm_.
# Re-enable these cases once the operator is supported.
# https://github.com/intel/torch-xpu-ops/issues/380
"test_compare_cpu_nn_functional_embedding_bag_xpu_float32",
"test_compare_cpu_nn_functional_embedding_bag_xpu_float64",
"test_view_replay_nn_functional_embedding_bag_xpu_float32",

- #Double and complex datatype matmul is not supported in oneDNN
+ # Double and complex datatype matmul is not supported in oneDNN
"test_compare_cpu_cdist_xpu_float64",

# CPU reference fails: `abs_cpu` does not support bool.
# The case should be skipped by the PyTorch test infrastructure, but is not
# skipped correctly after https://github.com/pytorch/pytorch/pull/124147
# https://github.com/intel/torch-xpu-ops/issues/412
"test_compare_cpu_abs_xpu_bool",

# Bilinear interpolation involves many calculation steps; accuracy drops in half precision.
# Not in CUDA test scope either.
"test_compare_cpu_nn_functional_upsample_bilinear_xpu_bfloat16",
"test_compare_cpu_nn_functional_upsample_bilinear_xpu_float16",

# CPU result is not golden reference
"test_compare_cpu_nn_functional_group_norm_xpu_bfloat16",
"test_compare_cpu_nn_functional_group_norm_xpu_float16",
@@ -130,25 +116,20 @@
# Aligned with the CUDA impl by using an accumulate type, which the CPU impl does not use.
# When XPU uses the original data type, the case passes.
"test_compare_cpu_logit_xpu_bfloat16",

# Precision error.
# Mismatched elements: 1 / 24 (4.2%)
# Greatest absolute difference: 0.03125 at index (0, 1, 0, 1) (up to 0.001 allowed)
# Greatest relative difference: 0.0048828125 at index (0, 1, 0, 1) (up to 0.001 allowed)
"test_compare_cpu_nn_functional_interpolate_bilinear_xpu_bfloat16",

# RuntimeError: "compute_index_ranges_weights" not implemented for 'Half'
"test_compare_cpu_nn_functional_interpolate_bilinear_xpu_float16",

# AssertionError: False is not true : Argument 0 during forward call unexpectedly materializes. Either set `supports_cow_input_no_materialize_forward=False...
"test_cow_input_nn_functional_interpolate_bilinear_xpu_float32",
"test_cow_input_nn_functional_interpolate_linear_xpu_float32",
"test_cow_input_nn_functional_interpolate_trilinear_xpu_float32",

# The results of XPU and CUDA are consistent, but the results of CPU and CUDA are inconsistent
"test_compare_cpu_nn_functional_interpolate_linear_xpu_bfloat16",
"test_compare_cpu_nn_functional_interpolate_linear_xpu_float16",

# Bicubic interpolation involves many calculation steps; accuracy drops in half precision.
# Not in CUDA test scope either.
"test_compare_cpu_nn_functional_interpolate_bicubic_xpu_bfloat16",
@@ -157,17 +138,14 @@
# Re-enable it once the operator is implemented.
# Error: The operator 'aten::glu_jvp' is not currently implemented for the XPU device.
"test_forward_ad_nn_functional_glu_xpu_float32",

# Precision error.
# Mismatched elements: 1 / 812 (0.1%)
# Greatest absolute difference: 0.03125 at index (610,) (up to 0.001 allowed)
# Greatest relative difference: 0.00396728515625 at index (610,) (up to 0.001 allowed)
"test_compare_cpu_hypot_xpu_bfloat16",

# RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16.
# Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with the same error.
"test_compare_cpu_polar_xpu_bfloat16",

# Regressions due to PyTorch uplift (Numeric difference in float and bfloat)
# https://github.com/intel/torch-xpu-ops/issues/549
# Example fail log
@@ -179,25 +157,21 @@
"test_compare_cpu_std_mean_xpu_bfloat16",
"test_compare_cpu_sub_xpu_float16",
"test_compare_cpu_var_mean_xpu_bfloat16",

# test case doesn't make sense, will file an issue to track it.
# https://github.com/pytorch/pytorch/issues/130916
"test_compare_cpu_histogram_xpu_float32",
"test_compare_cpu_histogram_xpu_float64",

# Precision error.
# Mismatched elements: 2 / 125 (1.6%)
# Greatest absolute difference: 0.001953125 at index (2, 0, 0) (up to 0.001 allowed)
# Greatest relative difference: 0.007568359375 at index (2, 0, 0) (up to 0.001 allowed)
"test_compare_cpu_cumprod_xpu_bfloat16",

# Precision error.
# Mismatched elements: 1 / 9 (11.1%)
# Greatest absolute difference: 0.001953125 at index (2, 2) (up to 0.001 allowed)
# Greatest relative difference: 0.004669189453125 at index (2, 2) (up to 0.001 allowed)
# Not in CUDA test scope either
"test_compare_cpu_prod_xpu_bfloat16 ",

# different results for value index due to unstable sort.
# XPU and CUDA have the same result.
"test_compare_cpu_median_xpu_int16",
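The sigmoid skips earlier in the list rest on the C++ rule (cited in the file) that std::exp maps complex inputs with a -inf real part to (±0,±0), while the CPU reference reportedly returns NaN. A small stand-alone probe of that corner case, independent of this test harness, could look like the following; the printed values depend on the backend and build, which is exactly the mismatch the skip documents.

import torch

# Complex inputs whose real part is -inf; per the C++ standard, exp() maps
# these to (±0, ±0) with unspecified signs.
# https://en.cppreference.com/w/cpp/numeric/complex/exp
z = torch.tensor(
    [complex(float("-inf"), float("nan")), complex(float("-inf"), float("inf"))],
    dtype=torch.complex64,
)
print(torch.exp(z))  # the CPU path may print NaN here; the skip list records that gap

# sigmoid(z) = 1 / (1 + exp(-z)), so the same corner surfaces for inputs
# whose real part is +inf.
w = torch.tensor([complex(float("inf"), float("nan"))], dtype=torch.complex64)
print(torch.sigmoid(w))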
