From 7b266fce58eadb4164f4631f303ad6ec764cc128 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Tue, 3 Sep 2024 20:43:17 +0800
Subject: [PATCH] refined align_supported_dtypes and test_ops.py skip list (#779)

1. Updated align_supported_dtypes() (a readable restatement of the new logic follows the change list below):
   a) If forward does not support bfloat16, do not add bfloat16 to the backward dtypes.
   b) For _refs.xxx, if op xxx is supported by torch-xpu-ops, also align the backward dtypes with CUDA.
2. Added more ops to _xpu_computation_op_list to align their dtypes with CUDA.
3. Updated the skip lists:
   a) Added cases to the skip list: changes 1 and 2 enable more cases, and some of them are XFAIL on CUDA but pass on XPU.
   b) Removed cases from the skip list:

      # These cases no longer exist:
      # "test_python_ref__refs_linspace_tensor_overload_xpu_int16",
      # "test_python_ref__refs_linspace_tensor_overload_xpu_int32",
      # "test_python_ref__refs_linspace_tensor_overload_xpu_int64",
      # "test_python_ref__refs_linspace_tensor_overload_xpu_int8",
      # "test_python_ref__refs_linspace_tensor_overload_xpu_uint8",
      # "test_python_ref__refs_linspace_xpu_int16",
      # "test_python_ref__refs_linspace_xpu_int32",
      # "test_python_ref__refs_linspace_xpu_int64",
      # "test_python_ref__refs_linspace_xpu_int8",
      # "test_python_ref__refs_linspace_xpu_uint8",
      # "test_python_ref__refs_logaddexp_xpu_complex128",
      # "test_python_ref__refs_logaddexp_xpu_complex64",
      # "test_python_ref__refs_native_layer_norm_xpu_bfloat16",
      # "test_python_ref__refs_native_layer_norm_xpu_float16",
      # "test_python_ref__refs_native_layer_norm_xpu_float32",
      # "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16",
      # "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_float16",
      # "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_bfloat16",
      # "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_float16",
      # "test_python_ref__refs_nn_functional_triplet_margin_loss_xpu_uint8",
      # "test_python_ref__refs_square_xpu_bool",
      # "test_python_ref__refs_trunc_xpu_float64", # skipped
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_float16",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_float32",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_float64",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_int16",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_int32",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_int64",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_int8",
      # "test_python_ref_executor__refs_geometric_executor_aten_xpu_uint8",
      # "test_python_ref_executor__refs_linspace_executor_aten_xpu_int16",
      # "test_python_ref_executor__refs_linspace_executor_aten_xpu_int32",
      # "test_python_ref_executor__refs_linspace_executor_aten_xpu_int64",
      # "test_python_ref_executor__refs_linspace_executor_aten_xpu_int8",
      # "test_python_ref_executor__refs_linspace_executor_aten_xpu_uint8",
      # "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int16",
      # "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int32",
      # "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int64",
      # "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int8",
      # "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_uint8",
      # "test_python_ref_executor__refs_log_normal_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float16",
      # "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float32",
      # "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float64", # skipped
      # "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float16", # need to add native_layer_norm to the op list
      # "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float32", # skipped
      # "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float16",
      # "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float32",
      # "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float64", # skipped
      # "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_float16",
      # "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_bfloat16",
      # "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_float16",
      # "test_python_ref_executor__refs_nn_functional_triplet_margin_loss_executor_aten_xpu_uint8", # skipped
      # "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex128",
      # "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex64", # skipped
      # "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int16",
      # "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int32",
      # "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int64",
      # "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int8",
      # "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_uint8",
      # "test_python_ref_torch_fallback__refs_linspace_xpu_int16",
      # "test_python_ref_torch_fallback__refs_linspace_xpu_int32",
      # "test_python_ref_torch_fallback__refs_linspace_xpu_int64",
      # "test_python_ref_torch_fallback__refs_linspace_xpu_int8",
      # "test_python_ref_torch_fallback__refs_linspace_xpu_uint8",
      # "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex128",
      # "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex64",
      # "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_bfloat16",
      # "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float16", # skipped? added native_layer_norm to the op list
      # "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float32", # skipped
      # "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16",
      # "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_float16",
      # "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_bfloat16",
      # "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_float16",
      # "test_python_ref_torch_fallback__refs_sinh_xpu_complex128",
      # "test_python_ref_torch_fallback__refs_special_multigammaln_mvlgamma_p_5_xpu_int32", # skipped?
      # "test_python_ref_torch_fallback__refs_square_xpu_bool", # skipped
      # "test_python_ref_torch_fallback__refs_vdot_xpu_complex128",
      # "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", # skipped
      # "test_compare_cpu__refs_special_zeta_xpu_float32", # skipped?
      # "test_python_ref__refs_heaviside_xpu_int64", # skipped
      # "test_python_ref__refs_special_bessel_j0_xpu_int64",
      # "test_python_ref_errors__refs_dstack_xpu",
      # "test_python_ref_errors__refs_hstack_xpu",
      # "test_python_ref_errors__refs_linalg_cross_xpu",
      # "test_python_ref_errors__refs_vstack_xpu",
      # "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32",
      # "test_python_ref__refs_special_multigammaln_mvlgamma_p_5_xpu_float64",
      # "test_python_ref_executor__refs_special_multigammaln_mvlgamma_p_3_executor_aten_xpu_float64",
      # "test_python_ref__refs_square_xpu_complex128",
      # "test_python_ref__refs_square_xpu_complex64",
      # "test_python_ref_executor__refs_istft_executor_aten_xpu_complex128",
      # "test_python_ref_executor__refs_square_executor_aten_xpu_complex128",
      # "test_python_ref_torch_fallback__refs_square_xpu_complex128",
      # "test_python_ref_torch_fallback__refs_square_xpu_complex64",

      # Fixed in xpu_test_utils.py by not adding bfloat16 to backward if bfloat16 is not enabled in forward:
      # "test_dtypes_view_as_complex_xpu", # Didn't align with CUDA, the following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16}
      # "test_dtypes_view_as_real_xpu", # Didn't align with CUDA, the following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16}

      # These tests no longer exist:
      # "test_noncontiguous_samples_native_dropout_backward_xpu_int64", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Long'.
      # "test_non_standard_bool_values_native_dropout_backward_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Bool'.
      # "test_non_standard_bool_values_scatter_reduce_amax_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'
      # "test_non_standard_bool_values_scatter_reduce_amin_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'
      # "test_non_standard_bool_values_scatter_reduce_prod_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool'

      # Skipped by adding argsort and sort to the op list:
      # "test_non_standard_bool_values_argsort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "argsort" not implemented for 'Bool'.

      # Skipped:
      # "test_out_triangular_solve_xpu_float32",

      # Added square to the op list; this op is defined in ATen, so no backend implementation is needed:
      # "test_python_ref_executor__refs_square_executor_aten_xpu_bool",

      # Need to add native_layer_norm to the op list:
      # "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float32",

---------

Co-authored-by: Huaiyu, Zheng
---
"test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", - "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", + "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", + + # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_sparse_sampled_addmm_xpu_float32", + "test_errors_sparse_mul_layout0_xpu", + "test_errors_sparse_mul_layout1_xpu", + "test_errors_sparse_mul_layout2_xpu", + "test_errors_sparse_mul_layout3_xpu", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", + "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", + + # NotImplementedError: Could not run 'aten::_to_dense' with arguments from the 'SparseXPU' backend. + # https://github.com/intel/torch-xpu-ops/issues/357 + "test_compare_cpu_to_sparse_xpu_float32", + "test_variant_consistency_eager_to_sparse_xpu_float32", + + # RuntimeError: sparse_dim expected sparse or strided tensor layout but got Sparse + # Issue https://github.com/intel/torch-xpu-ops/issues/357 + "test_variant_consistency_eager_to_sparse_xpu_complex64", + "test_non_standard_bool_values_to_sparse_xpu_bool", + + # OneDNN issues, https://github.com/intel/torch-xpu-ops/issues/253 + # RuntimeError: Long is not supported in oneDNN! + # RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN + "test_noncontiguous_samples_nn_functional_conv3d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv_transpose1d_xpu_int64", # "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose2d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_complex64", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_float32", "test_noncontiguous_samples_nn_functional_conv_transpose3d_xpu_int64", - "test_noncontiguous_samples_nn_functional_rrelu_xpu_float32", + "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", + "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + + # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu + # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", "test_out_mode_xpu_float32", + + # RuntimeError: false INTERNAL ASSERT FAILED at "/home/gta/daisyden/pytorch4/aten/src/ATen/native/DispatchStub.cpp":220, please report a bug to PyTorch. 
DispatchStub: missing kernel for xpu "test_out_nanmean_xpu_float32", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_complex64", - "test_out_requires_grad_error_sparse_sampled_addmm_xpu_float32", - "test_out_warning_nanmean_xpu", - "test_python_ref__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref__refs_linspace_xpu_int16", - "test_python_ref__refs_linspace_xpu_int32", - "test_python_ref__refs_linspace_xpu_int64", - "test_python_ref__refs_linspace_xpu_int8", - "test_python_ref__refs_linspace_xpu_uint8", - "test_python_ref__refs_logaddexp_xpu_complex128", - "test_python_ref__refs_logaddexp_xpu_complex64", - "test_python_ref__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref__refs_native_layer_norm_xpu_float16", - "test_python_ref__refs_native_layer_norm_xpu_float32", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref__refs_nn_functional_triplet_margin_loss_xpu_uint8", - "test_python_ref__refs_square_xpu_bool", - "test_python_ref__refs_trunc_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_float64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int16", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int32", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int64", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_int8", - "test_python_ref_executor__refs_geometric_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int16", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int32", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int64", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_int8", - "test_python_ref_executor__refs_linspace_tensor_overload_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float16", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float32", - "test_python_ref_executor__refs_log_normal_executor_aten_xpu_float64", + "test_out_warning_nanmean_xpu", + + # NameError: name 'nanj' is not defined. Did you mean: 'nan'? 
+ # https://github.com/intel/torch-xpu-ops/issues/768 "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex128", "test_python_ref_executor__refs_logaddexp_executor_aten_xpu_complex64", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float16", - "test_python_ref_executor__refs_native_layer_norm_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float32", - "test_python_ref_executor__refs_nn_functional_alpha_dropout_executor_aten_xpu_float64", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_hinge_embedding_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_bfloat16", - "test_python_ref_executor__refs_nn_functional_margin_ranking_loss_executor_aten_xpu_float16", - "test_python_ref_executor__refs_nn_functional_triplet_margin_loss_executor_aten_xpu_uint8", - "test_python_ref_executor__refs_square_executor_aten_xpu_bool", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_vdot_executor_aten_xpu_complex64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_tensor_overload_xpu_uint8", - "test_python_ref_torch_fallback__refs_linspace_xpu_int16", - "test_python_ref_torch_fallback__refs_linspace_xpu_int32", - "test_python_ref_torch_fallback__refs_linspace_xpu_int64", - "test_python_ref_torch_fallback__refs_linspace_xpu_int8", - "test_python_ref_torch_fallback__refs_linspace_xpu_uint8", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex128", - "test_python_ref_torch_fallback__refs_logaddexp_xpu_complex64", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float16", - "test_python_ref_torch_fallback__refs_native_layer_norm_xpu_float32", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_hinge_embedding_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_bfloat16", - "test_python_ref_torch_fallback__refs_nn_functional_margin_ranking_loss_xpu_float16", - "test_python_ref_torch_fallback__refs_sinh_xpu_complex128", - "test_python_ref_torch_fallback__refs_special_multigammaln_mvlgamma_p_5_xpu_int32", - "test_python_ref_torch_fallback__refs_square_xpu_bool", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex128", - "test_python_ref_torch_fallback__refs_vdot_xpu_complex64", + + # RuntimeError: could not create a primitive descriptor for a deconvolution + # https://github.com/intel/torch-xpu-ops/issues/253 "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose2d_xpu_float32", 
"test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - "test_variant_consistency_eager_nn_functional_rrelu_xpu_float32", - "test_variant_consistency_eager_to_sparse_xpu_complex64", - "test_variant_consistency_eager_to_sparse_xpu_float32", - "test_compare_cpu__refs_special_zeta_xpu_float32", + + # Need revisit when the op is enabled + # Unexpected success, xpu passed because it compares to cpu "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - "test_compare_cpu_special_zeta_xpu_float32", + "test_compare_cpu_special_zeta_xpu_float32", + + # XFAIL of CUDA and XPU, unexpected success in fallback "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_triangular_solve_xpu_float32", - "test_python_ref__refs_heaviside_xpu_int64", - "test_python_ref__refs_special_bessel_j0_xpu_int64", - "test_python_ref_errors__refs_dstack_xpu", - "test_python_ref_errors__refs_hstack_xpu", - "test_python_ref_errors__refs_linalg_cross_xpu", - "test_python_ref_errors__refs_vstack_xpu", + + # XFAIL of CUDA, XPU got unexpected success + "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", - "test_python_ref__refs_special_multigammaln_mvlgamma_p_5_xpu_float64", - "test_python_ref_executor__refs_special_multigammaln_mvlgamma_p_3_executor_aten_xpu_float64", - "test_python_ref__refs_square_xpu_complex128", - "test_python_ref__refs_square_xpu_complex64", - "test_python_ref_executor__refs_istft_executor_aten_xpu_complex128", - "test_python_ref_executor__refs_square_executor_aten_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex128", - "test_python_ref_torch_fallback__refs_square_xpu_complex64", - # Skip list of new added when porting XPU operators. - # See: https://github.com/intel/torch-xpu-ops/issues/128 - "test_dtypes_view_as_complex_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_dtypes_view_as_real_xpu", # Didn't align with CUDA, The following dtypes did not work in backward but are listed by the OpInfo: {torch.bfloat16} - "test_noncontiguous_samples_native_dropout_backward_xpu_int64", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Long'. - "test_non_standard_bool_values_native_dropout_backward_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "masked_scale" not implemented for 'Bool'. - "test_non_standard_bool_values_scatter_reduce_amax_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_amin_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_scatter_reduce_prod_xpu_bool", # Align with CUDA dtypes - "scatter_gather_base_kernel_func" not implemented for 'Bool' - "test_non_standard_bool_values_argsort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "argsort" not implemented for 'Bool'. 
- "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref__refs_pow_xpu_complex32", + "test_python_ref_executor__refs_mul_executor_aten_xpu_complex32", + "test_python_ref_torch_fallback__refs_div_no_rounding_mode_xpu_complex32", + "test_python_ref_torch_fallback__refs_pow_xpu_complex32", + + + # unexpected success because of cpu fallback + "test_out_triangular_solve_xpu_float32", + + # Newly added: + + # Cuda skipped it "test_non_standard_bool_values_sort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "sort" not implemented for 'Bool'. + + # Cuda skipped it + "test_non_standard_bool_values_msort_xpu_bool", # The implementation aligns with CUDA, RuntimeError: "msort" not implemented for 'Bool'. + + + # Unexpected success "test_python_ref_executor__refs_pow_executor_aten_xpu_complex32", # Didn't align with CUDA, Unexpected success + # Unexpected success - "test_errors_histogramdd_xpu", + # "test_errors_histogramdd_xpu", #XFAIL now # Jiterator is only supported on CUDA and ROCm GPUs, none are available. + # https://github.com/intel/torch-xpu-ops/issues/584 "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: @@ -473,9 +422,11 @@ "test_variant_consistency_eager_triangular_solve_xpu_complex64", # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow + # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", # CPU fallback error: AssertionError: Tensor-likes are not close! + # https://github.com/intel/torch-xpu-ops/issues/271 "test_neg_view_nn_functional_rrelu_xpu_float64", ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 @@ -693,7 +644,7 @@ "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", ### Error #2 in TestMathBitsXPU , NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseXPU' backend. - # https://github.com/intel/torch-xpu-ops/issues/242 + # https://github.com/intel/torch-xpu-ops/issues/242 and https://github.com/intel/torch-xpu-ops/issues/240 "test_conj_view_to_sparse_xpu_complex64", "test_neg_conj_view_to_sparse_xpu_complex128", "test_neg_view_to_sparse_xpu_float64", @@ -703,13 +654,14 @@ # in XPU supported operators. Then the case will work. "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", + #AssertionError: The supported dtypes for unique_consecutive on device type xpu are incorrect! #The following dtypes worked in forward but are not listed by the OpInfo: {torch.bfloat16}. - #XPU supports bfloat16, CUDA doesn't support it. + # XPU supports bfloat16, CUDA doesn't support it. "test_dtypes_unique_xpu", # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. 
- "test_dtypes_polar_xpu", + #"test_dtypes_polar_xpu", # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", @@ -718,7 +670,15 @@ # 1. most cases of nextafter require Half dtype. # 2. Half dtype is a common dtype in workloads. # So far CUDA doesn't support Half, so that XPU fails as we aligned claimed dtypes with CUDA in test infra. + # https://github.com/intel/torch-xpu-ops/issues/623 "test_dtypes_nextafter_xpu", + + # AssertionError: The supported dtypes for argsort on device type xpu are incorrect! + # The following dtypes worked in forward but are not listed by the OpInfo: {torch.bool}. + # CUDA does not have torch.bool support on argsort. + "test_dtypes_argsort_xpu", + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + "test_errors_histogramdd_xpu", ), "test_binary_ufuncs_xpu.py": ( @@ -1160,9 +1120,9 @@ # It is kernel assert on XPU implementation not exception on host. # We are same as CUDA implementation. And CUDA skips these cases. "test_trivial_fancy_out_of_bounds_xpu", - "test_advancedindex", - # CUDA bias case - "test_index_put_accumulate_with_optional_tensors_xpu", + # index boundary should be checked. + # https://github.com/intel/torch-xpu-ops/issues/783 + "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1305,10 +1265,18 @@ "test_reference_numerics_normal_polygamma_polygamma_n_2_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_3_xpu_float16", "test_reference_numerics_normal_polygamma_polygamma_n_4_xpu_float16", + + # CUDA XFAIL + "test_reference_numerics_large__refs_rsqrt_xpu_complex32", + + # Compiler issue in handling tanh with real or imag inf. + # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 + "test_reference_numerics_large__refs_tanh_xpu_complex32", ), "test_masked_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse. + # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", "test_mask_layout_sparse_coo_masked_amax_xpu_float16", "test_mask_layout_sparse_coo_masked_amax_xpu_float32", @@ -1963,6 +1931,7 @@ "test_maskedtensor_xpu.py": ( # RuntimeError: is_coalesced expected sparse coordinate tensor layout but got Sparse + # https://github.com/intel/torch-xpu-ops/issues/357 "test_contiguous_xpu", "test_invalid_sparse_coo_values_xpu", "test_to_dense_and_sparse_coo_xpu", @@ -2682,6 +2651,7 @@ "test_autodiff__foreach_sigmoid_outplace_xpu_complex128", "test_binary_op_with_scalar_self_support__foreach_pow_is_fastpath_True_xpu_bool", # AssertionError: RuntimeError not raised + # https://github.com/intel/torch-xpu-ops/issues/784 "test_0dim_tensor_overload_exception_xpu", # RuntimeError: Tried to instantiate dummy base class CUDAGraph "test_big_num_tensors__foreach_max_use_cuda_graph_True_xpu_float32", diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index c3637b0fe..d57567318 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -11,6 +11,37 @@ with XPUPatchForImport(False): from test_indexing import NumpyTests,TestIndexing + import torch + + + def __test_index_put_accumulate_with_optional_tensors(self, device): + # TODO: replace with a better solution. 
+ # Currently, here using torchscript to put None into indices. + # on C++ it gives indices as a list of 2 optional tensors: first is null and + # the second is a valid tensor. + @torch.jit.script + def func(x, i, v): + idx = [None, i] + x.index_put_(idx, v, accumulate=True) + return x + + n = 4 + t = torch.arange(n * 2, dtype=torch.float32).reshape(n, 2) + t_dev = t.to(device) + indices = torch.tensor([1, 0]) + indices_dev = indices.to(device) + value0d = torch.tensor(10.0) + value1d = torch.tensor([1.0, 2.0]) + + out_cuda = func(t_dev, indices_dev, value0d.xpu()) + out_cpu = func(t, indices, value0d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + out_cuda = func(t_dev, indices_dev, value1d.xpu()) + out_cpu = func(t, indices, value1d) + self.assertEqual(out_cuda.cpu(), out_cpu) + + TestIndexing.test_index_put_accumulate_with_optional_tensors = __test_index_put_accumulate_with_optional_tensors instantiate_device_type_tests(NumpyTests, globals(), only_for=("xpu"), allow_xpu=True) diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index 7672c0bb4..01d140858 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -237,6 +237,11 @@ "nan_to_num", "scatter_reduce", "nanmean", + "native_layer_norm", + "native_layer_norm_backward", + "square", + "heaviside", + "argsort", ] _ops_without_cuda_support = [ @@ -415,6 +420,7 @@ def ModuleTest_test_xpu(self, test_case): xpu_gradInput = test_case._backward( xpu_module, xpu_input_tuple, xpu_output, xpu_gradOutput ) + test_case.assertEqual( cpu_gradInput, xpu_gradInput, @@ -757,14 +763,13 @@ def gen_xpu_wrappers(op_name, wrappers): def align_supported_dtypes(self, db): for opinfo in db: - if ( - opinfo.name not in _xpu_computation_op_list - or opinfo.name in _ops_without_cuda_support - ): + if ( opinfo.name not in _xpu_computation_op_list and (opinfo.torch_opinfo.name not in _xpu_computation_op_list + if db == common_methods_invocations.python_ref_db else True)) or opinfo.name in _ops_without_cuda_support: opinfo.dtypesIfXPU = opinfo.dtypes else: backward_dtypes = set(opinfo.backward_dtypesIfCUDA) - backward_dtypes.add(bfloat16) + if bfloat16 in opinfo.dtypesIfXPU: + backward_dtypes.add(bfloat16) opinfo.backward_dtypes = tuple(backward_dtypes) if "has_fp64=0" in str(torch.xpu.get_device_properties(0)): @@ -990,4 +995,4 @@ def launch_test(test_case, skip_list=None, exe_list=None): "pytest -v " + test_case ) - return os.system(test_command) \ No newline at end of file + return os.system(test_command)
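
Usage note (hypothetical, for illustration only): the refreshed skip lists are plain
Python data, so a runner can feed them straight into launch_test() from
xpu_test_utils.py. The snippet below assumes it is executed from test/xpu/ so that
both modules are importable; the real CI entry point may differ.

    # Hypothetical driver that consumes skip_list_common.py via launch_test().
    from skip_list_common import skip_dict
    from xpu_test_utils import launch_test

    status = 0
    for test_file, skip_cases in skip_dict.items():
        # launch_test() builds a "pytest -v ..." command that excludes the skipped
        # cases and returns the exit status from os.system().
        status |= launch_test(test_file, skip_list=skip_cases)

    raise SystemExit(1 if status else 0)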