diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
index f0620c530..4a34e70d1 100644
--- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
+++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
@@ -29,7 +29,7 @@ Tensor adaptive_avg_pool2d_backward_xpu(
       (input.ndimension() == 3 || input.ndimension() == 4),
       "non-empty 3D or 4D (batch mode) tensor expected for input");

-  globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward");
+  globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_xpu");

   Tensor grad_input;
   if (input.numel() != 0) {
diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
index ee8c37ac0..aec707193 100644
--- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
@@ -30,6 +30,7 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu)
     std::optional<double> scales_h,
     std::optional<double> scales_w,
     const Tensor& grad_input) {
+  globalContext().alertNotDeterministic("upsample_bilinear2d_backward_out_xpu");
   xpu::upsample_bilinear2d_backward_out_kernel(
       grad_input,
       grad_output,
diff --git a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
index 57ac0d114..fb034f988 100644
--- a/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/sycl/EmbeddingBag.cpp
@@ -531,6 +531,8 @@ Tensor embedding_bag_backward_xpu_max(
     const Tensor& max_indices_t,
     int64_t num_weights,
     int64_t padding_idx) {
+  globalContext().alertNotDeterministic("embedding_bag_backward_xpu_max");
+
   auto max_indices = max_indices_t.contiguous();
   auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.options());
   int64_t stride = grad_weight.stride(0);
diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index d429ecfbe..bcbd50c42 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -207,7 +207,7 @@ void index_select_kernel(
       }),
       AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
       AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
-      AT_EXPAND(AT_FLOAT8_TYPES),
+      AT_EXPAND(AT_FLOAT8_TYPES),
      kComplexHalf,
      kHalf,
      kBool,
@@ -1081,7 +1081,8 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) {
      canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long,
      "take_xpu_index",
      [&] {
-       const scalar_t* indexed_ptr = input.template const_data_ptr<scalar_t>();
+       const scalar_t* indexed_ptr =
+           input.template const_data_ptr<scalar_t>();
        TakeFunctor f(indexed_ptr);
        take_put_kernel_template(iter, input, f);
      });
@@ -1114,6 +1115,14 @@ void put_kernel(
     TensorIterator& iter,
     const TensorBase& output,
     const bool accumulate) {
+  // Nondeterministic when index contains duplicate entries and we do not
+  // accumulate. If we accumulate on GPU, we use atomicGPUAdd, which is
+  // non-deterministic.
+  if (!accumulate ||
+      (accumulate && iter.tensor(1).device().type() == DeviceType::XPU)) {
+    at::globalContext().alertNotDeterministic("put_");
+  }
+
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
       at::ScalarType::BFloat16,
       at::ScalarType::Half,
diff --git a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
index 9d26a48c7..3dd44968d 100644
--- a/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LossCTCKernels.cpp
@@ -1248,7 +1248,7 @@ Tensor ctc_loss_backward_kernel(
     bool zero_infinity) {
   // See Note [Writing Nondeterministic Operations]
   // Nondeterministic because of atomicAdd usage
-  globalContext().alertNotDeterministic("ctc_loss_backward_kernel");
+  globalContext().alertNotDeterministic("ctc_loss_backward_xpu");
   return AT_DISPATCH_FLOATING_TYPES(
       log_probs.scalar_type(), "ctc_loss_backward_xpu", [&] {
         if (targets.scalar_type() == kLong) {
diff --git a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
index 4b93cb3c3..8b018de6b 100644
--- a/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
+++ b/src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
@@ -186,7 +186,7 @@ void nll_loss2d_forward_kernel(
     int64_t reduction,
     int64_t ignore_index) {
   if (reduction != at::Reduction::None) {
-    at::globalContext().alertNotDeterministic("nll_loss2d_forward_kernel");
+    at::globalContext().alertNotDeterministic("nll_loss2d_forward_xpu");
   }

   total_weight.resize_({});
diff --git a/test/xpu/skip_list_common.py b/test/xpu/skip_list_common.py
index 7c3aa7f8e..e1903f871 100644
--- a/test/xpu/skip_list_common.py
+++ b/test/xpu/skip_list_common.py
@@ -2,30 +2,43 @@
     "test_ops_xpu.py": (
         # Skip list of base line
-        # Need to revisit when the ops are enabled
-        # AssertionError: The supported dtypes for xxx on device type xpu are incorrect!
+        # To be removed from this file.
+        # CUDA and XPU both XFAIL now.
+        "test_out_narrow_copy_xpu_float32",
+        # This case is marked as skip, but XPU fails; CUDA and XPU throw the same runtime error.
+        "test_out_histc_xpu_float32",
+
+        # AssertionError: The supported dtypes for __rmod__ on device type xpu are incorrect!
+        # The following dtypes worked in forward but are not listed by the OpInfo: {torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8}.
         "test_dtypes___rmod___xpu",
+
+        # Data type is not supported in oneDNN!
         "test_dtypes_nn_functional_conv1d_xpu",
         "test_dtypes_nn_functional_conv2d_xpu",
         "test_dtypes_nn_functional_conv3d_xpu",
         "test_dtypes_nn_functional_conv_transpose1d_xpu",
         "test_dtypes_nn_functional_conv_transpose2d_xpu",
         "test_dtypes_nn_functional_conv_transpose3d_xpu",
+
+        # AssertionError: The supported dtypes for nn.functional.softsign on device type xpu are incorrect!
         "test_dtypes_nn_functional_softsign_xpu",
+
+        # AssertionError: The supported dtypes for sparse.sampled_addmm on device type xpu are incorrect!
- OPs not supported "test_dtypes_sparse_sampled_addmm_xpu", - # AssertionError: RuntimeError not raised + + # OPs not supported "test_errors_dot_xpu", - "test_errors_kthvalue_xpu", "test_errors_vdot_xpu", - # Fallback cases with skipCPUIfNoLapack, AssertionError: Tensor-likes are not close! + + # Linalg OPs not supported "test_noncontiguous_samples_linalg_det_xpu_float32", "test_noncontiguous_samples_linalg_slogdet_xpu_float32", "test_noncontiguous_samples_linalg_solve_ex_xpu_float32", "test_noncontiguous_samples_linalg_solve_xpu_float32", "test_noncontiguous_samples_linalg_tensorsolve_xpu_float32", "test_noncontiguous_samples_logdet_xpu_float32", - "test_noncontiguous_samples_nn_functional_conv3d_xpu_complex64", + # Sparse CSR OPs not supported # RuntimeError: device type of values (xpu) must be CPU or CUDA or Meta # https://github.com/intel/torch-xpu-ops/issues/357 "test_compare_cpu_sparse_sampled_addmm_xpu_float32", @@ -51,6 +64,7 @@ "test_noncontiguous_samples_nn_functional_conv1d_xpu_int64", "test_noncontiguous_samples_nn_functional_conv2d_xpu_int64", + # Linalg OPs not supported # RuntimeError: mode only supports CPU AND CUDA device type, got: xpu # Issue https://github.com/intel/torch-xpu-ops/issues/327 "test_numpy_ref_linalg_tensorinv_xpu_float64", @@ -62,19 +76,20 @@ "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_complex64", "test_variant_consistency_eager_nn_functional_conv_transpose3d_xpu_float32", - # Need revisit when the op is enabled - # Unexpected success, xpu passed because it compares to cpu + # Linalg OPs not supported "test_compare_cpu_linalg_lu_factor_ex_xpu_float32", "test_compare_cpu_linalg_lu_factor_xpu_float32", "test_compare_cpu_linalg_lu_xpu_float32", + + # XPU hang. CUDA hang as well. + # https://github.com/pytorch/pytorch/issues/79528 "test_compare_cpu_special_hermite_polynomial_h_xpu_float32", - # XFAIL of CUDA and XPU, unexpected success in fallback + # XFAIL of CUDA and XPU, unexpected success in fallback + # Linalg OPs not supported "test_out_cholesky_inverse_xpu_float32", "test_out_geqrf_xpu_float32", - "test_out_narrow_copy_xpu_float32", "test_out_ormqr_xpu_float32", - "test_out_histc_xpu_float32", # XFAIL of CUDA, XPU got unexpected success "test_python_ref__refs_div_no_rounding_mode_xpu_complex32", @@ -87,6 +102,7 @@ "test_python_ref_torch_fallback__refs_pow_xpu_complex32", # unexpected success because of cpu fallback + # Linalg OPs not supported "test_out_triangular_solve_xpu_float32", # Newly added: @@ -107,15 +123,17 @@ "_jiterator_", # https://github.com/intel/torch-xpu-ops/issues/157 # Segfault: - "test_dtypes_nn_functional_linear_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_nn_functional_multi_head_attention_forward_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # Linalg OPs not supported "test_dtypes_pca_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 "test_dtypes_svd_lowrank_xpu", # https://github.com/intel/torch-xpu-ops/issues/157 + + # RuntimeError: Long is not supported in oneDNN! 
"test_noncontiguous_samples_nn_functional_linear_xpu_int64", # https://github.com/intel/torch-xpu-ops/issues/157 + # https://github.com/intel/torch-xpu-ops/issues/157 - # Failures: - "test_compare_cpu_addmm_xpu_float32", - "test_compare_cpu_addmv_xpu_float32", + # Datatype not supported in oneDNN "test_dtypes_addmm_decomposed_xpu", "test_dtypes_addmm_xpu", "test_dtypes_addmv_xpu", @@ -395,11 +413,13 @@ "test_variant_consistency_eager_svd_xpu_complex64", "test_variant_consistency_eager_tensordot_xpu_complex64", "test_variant_consistency_eager_triangular_solve_xpu_complex64", + # oneDNN issues # RuntimeError: value cannot be converted to type float without overflow # https://github.com/intel/torch-xpu-ops/issues/683 "test_conj_view_addbmm_xpu_complex64", "test_neg_conj_view_addbmm_xpu_complex128", + ### Error #0 in TestMathBitsXPU , RuntimeError: Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/254 "test_conj_view___rmatmul___xpu_complex64", @@ -609,32 +629,26 @@ "test_conj_view_svd_lowrank_xpu_complex64", "test_neg_conj_view_pca_lowrank_xpu_complex128", "test_neg_conj_view_svd_lowrank_xpu_complex128", + + # oneDNN issues ### Error #1 in TestMathBitsXPU , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive # https://github.com/intel/torch-xpu-ops/issues/253 "test_conj_view_nn_functional_conv_transpose2d_xpu_complex64", "test_conj_view_nn_functional_conv_transpose3d_xpu_complex64", "test_neg_view_nn_functional_conv_transpose2d_xpu_float64", "test_neg_view_nn_functional_conv_transpose3d_xpu_float64", - # Op impl aligns with CUDA on the supported dtypes. - # RuntimeError: "avg_pool2d_xpu" not implemented for 'Long'. - # Retrieve the case, once avg_pool1d is supported. Test infra will change claimed dtypes in test case once the op is listed - # in XPU supported operators. Then the case will work. - "test_noncontiguous_samples_nn_functional_avg_pool1d_xpu_int64", - "test_noncontiguous_samples_nn_functional_local_response_norm_xpu_int64", - - # RuntimeError: Expected both inputs to be Half, Float or Double tensors but got BFloat16 and BFloat16. - # Polar's backward is calculated using complex(), which does not support bfloat16. CUDA fails with same error. - #"test_dtypes_polar_xpu", + # implemented aten::histogram to align MPS operators coverage, CUDA doesn't support # but test_dtypes infrastructure leverage CUDA supported datatypes "test_dtypes_histogram_xpu", - # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported" + # Unexpected success, CUDA got XFAIL because CUDA does not have historgramadd supported "test_errors_histogramdd_xpu", # 2025 bundle std::pow complex result is different on host and device "test_python_ref__refs_square_xpu_complex64", "test_python_ref_torch_fallback__refs_square_xpu_complex64", + "test_python_ref_torch_fallback__refs_exp_xpu_complex128", ), "test_binary_ufuncs_xpu.py": ( @@ -661,7 +675,7 @@ "test_autograd_fallback_xpu.py": None, - "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA + "test_sort_and_select_xpu.py": ("test_sort_large_slice_xpu",), # Hard code CUDA, UT has already been rewritten to test/regressions/test_sort.py. "nn/test_embedding_xpu.py": ( # NotImplementedError: Could not run 'aten::_indices' with arguments from the 'SparseXPU' backend. @@ -713,8 +727,12 @@ "test_disable_fastpath_xpu", # We have no mechanism to handle SDPBackend::ERROR so far. 
Will give a fully support when we support all SDPBackends. "test_dispatch_fails_no_backend_xpu", + + # NestedTensorXPU not supported # Could not run 'aten::_to_copy' with arguments from the 'NestedTensorXPU' backend "test_with_nested_tensor_input_xpu", + + # oneDNN issues # Double and complex datatype matmul is not supported in oneDNN # https://github.com/intel/torch-xpu-ops/issues/253 "test_sdp_math_gradcheck_contiguous_inputs_False_xpu", @@ -920,12 +938,18 @@ "test_cpu_gpu_parity_nn_ConvTranspose2d_xpu_complex32", # CPU fallback fails # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. + + # aten::_thnn_fused_gru_cell not support XPU backend "test_save_load_nn_GRU_eval_mode_xpu_float32", "test_save_load_nn_GRUCell_xpu_float32", "test_save_load_nn_GRU_train_mode_xpu_float32", + + # aten::_thnn_fused_lstm_cell not support XPU backend # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. "_LSTM_", "_LSTMCell_", + + # aten::_thnn_fused_gru_cell not support XPU backend # CPU fallback fails # Could not run 'aten::_thnn_fused_gru_cell' with arguments from the 'CPU' backend. "test_to_nn_GRUCell_swap_True_set_grad_False_xpu_float32", @@ -991,6 +1015,7 @@ "test_type", # rnn fallback to cpu "test_cudnn_weight_format", + # oneDNN issues # AssertionError: MultiheadAttention does not support NestedTensor outside of its fast path. The fast path was not hit because some Tensor argument's device is neither one of cpu, cuda or privateuseone "test_TransformerEncoderLayer_empty_xpu", "test_transformerencoderlayer_xpu_float16", @@ -1015,12 +1040,8 @@ "test_rnn_retain_variables_xpu_float64", "test_transformerencoderlayer_xpu_float64", "test_variable_sequence_xpu_float64", - # AssertionError: RuntimeError not raised - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_3_mode_bilinear_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bicubic_uint8_xpu_uint8", - "test_upsamplingBiMode2d_nonsupported_dtypes_antialias_True_num_channels_5_mode_bilinear_uint8_xpu_uint8", - # upsamplingNearest2d: Failed: Unexpected success + # Unexpected success: CUDA only test case, launch grid_y == 2**16 (larger than CUDA maximum y-dimension limit 65535) and expect fail. + # SYCL don't have this limitation and hence can pass. "test_upsamplingNearest2d_launch_fail_xpu", # Could not run 'aten::_thnn_fused_lstm_cell' with arguments from the 'CPU' backend. "test_RNN_cudnn_weight_norm", @@ -1040,13 +1061,6 @@ ), "test_indexing_xpu.py": ( - # CPU bias cases - # It is kernel assert on XPU implementation not exception on host. - # We are same as CUDA implementation. And CUDA skips these cases. - "test_trivial_fancy_out_of_bounds_xpu", - # index boundary should be checked. - # https://github.com/intel/torch-xpu-ops/issues/783 - "test_advancedindex_xpu_float64", # XPU implementation doesn't claimn FP8 now # https://github.com/intel/torch-xpu-ops/issues/461 "test_index_put_src_datatype_xpu_float8_e5m2", @@ -1116,8 +1130,6 @@ "test_unary_ufuncs_xpu.py": ( # AssertionError: Jiterator is only supported on CUDA and ROCm GPUs, none are available. "_jiterator_", - # CPU Fallback fails: Tensor-likes are not close! 
- "test_reference_numerics_large_tanh_xpu_complex32", # For extreme value processing, Numpy and XPU results are inconsistent # std operations get different behavior on std::complex operarands for extremal cases "test_reference_numerics_extremal__refs_log_xpu_complex64", @@ -1158,7 +1170,7 @@ # Greatest relative difference: 1.9145216356264427e-05 at index (463, 204) (up to 1.3e-06 allowed) "test_reference_numerics_normal__refs_asinh_xpu_complex64", "test_reference_numerics_normal_asinh_xpu_complex64", - # Failed: Unexpected success + # Unexpected success: CUDA uses thrust::sqrt and has accuracy issue. XPU use std::sqrt and has no issue. "test_reference_numerics_large_rsqrt_xpu_complex32", # Numeric difference # https://github.com/intel/torch-xpu-ops/issues/544 @@ -1178,10 +1190,6 @@ # CUDA XFAIL "test_reference_numerics_large__refs_rsqrt_xpu_complex32", - # Compiler issue in handling tanh with real or imag inf. - # https://github.com/intel/torch-xpu-ops/issues/184, https://jira.devtools.intel.com/browse/CMPLRLIBS-34974 - "test_reference_numerics_large__refs_tanh_xpu_complex32", - # 2025 bundle std::pow complex result is different on host and device "test_exp_xpu_complex64", "test_reference_numerics_extremal__refs_exp2_xpu_complex64", @@ -1193,6 +1201,8 @@ ), "test_masked_xpu.py": ( + # Summary: Sparse CSR for XPU is not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. # https://github.com/intel/torch-xpu-ops/issues/357 "test_mask_layout_sparse_coo_masked_amax_xpu_bfloat16", @@ -1329,6 +1339,9 @@ "nn/test_lazy_modules_xpu.py": None, "test_linalg_xpu.py": ( + # Summary: + # All linear algebra related ops are not supported for XPU. + # _convert_weight_to_int4pack not support "_int4_mm_m_", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN @@ -1580,6 +1593,8 @@ ), "test_ops_fwd_gradients_xpu.py": ( + # All of the followings are oneDNN issues + # RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_fwgrad_bwgrad___rmatmul___xpu_complex128", "test_fn_fwgrad_bwgrad___rmatmul___xpu_float64", @@ -1884,6 +1899,8 @@ ), "test_maskedtensor_xpu.py": ( + # Summary: SparseCsrXPU OPs are not supported + # NotImplementedError: Could not run 'aten::_to_sparse_csr' with arguments from the 'SparseXPU' backend. # https://github.com/intel/torch-xpu-ops/issues/357 "test_to_dense_xpu", @@ -1987,13 +2004,12 @@ # ACTUAL: array([-1.108163e+12, 1.108163e+12], dtype=float32) # DESIRED: array([-1.108163e+12, 1.090847e+12], dtype=float32) "test_fq_module_per_tensor_xpu", - # AssertionError: False is not true : Expected dScale=tensor([-0.0173], device='xpu:0') to match scale.grad=tensor([0.0189], device='xpu:0') - "test_learnable_backward_per_channel_cuda_xpu", ), "quantization/core/test_workflow_module_xpu.py": None, "quantization/core/test_quantized_tensor_xpu.py": ( + # Summary: Quantized OPs are not supported for XPU # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend "test_compare_per_channel_device_numerics_xpu", # NotImplementedError: Could not run 'aten::dequantize.self' with arguments from the 'QuantizedXPU' backend. 
@@ -2022,6 +2038,8 @@ ), "test_ops_gradients_xpu.py": ( + # All are oneDNN issues + ### Error #0 in TestBwdGradientsXPU , totally 271 , RuntimeError: Double and complex datatype matmul is not supported in oneDNN "test_fn_grad___rmatmul___xpu_complex128", "test_fn_grad___rmatmul___xpu_float64", @@ -2297,11 +2315,13 @@ "test_fn_gradgrad_pca_lowrank_xpu_complex128", "test_fn_gradgrad_svd_lowrank_xpu_complex128", "test_fn_grad_linalg_norm_xpu_complex128", + ### Error #1 in TestBwdGradientsXPU , totally 4 , RuntimeError: value cannot be converted to type float without overflow "test_fn_grad_addbmm_xpu_complex128", "test_fn_gradgrad_addbmm_xpu_complex128", "test_inplace_grad_addbmm_xpu_complex128", "test_inplace_gradgrad_addbmm_xpu_complex128", + ### Error #4 in TestBwdGradientsXPU , totally 8 , RuntimeError: could not create a primitive descriptor for a deconvolution forward propagation primitive "test_fn_grad_nn_functional_conv_transpose2d_xpu_complex128", "test_fn_grad_nn_functional_conv_transpose2d_xpu_float64", @@ -2322,6 +2342,7 @@ ), "test_torch_xpu.py": ( + # 'torch.xpu' has no attribute ... ### Error #1 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'FloatTensor' "test_grad_scaling_state_dict_xpu", ### Error #2 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'torch.storage.TypedStorage' object has no attribute 'is_xpu' @@ -2331,6 +2352,7 @@ ### Error #4 in TestTorchDeviceTypeXPU , totally 4 , AttributeError: module 'torch.xpu' has no attribute 'FloatStorage' "test_storage_setitem_xpu_float32", "test_tensor_storage_type_xpu_float32", + ### Error #7 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map2_ is only implemented on CPU tensors "test_broadcast_fn_map2_xpu", ### Error #8 in TestTorchDeviceTypeXPU , totally 1 , TypeError: map_ is only implemented on CPU tensors @@ -2346,16 +2368,8 @@ "test_sync_warning_xpu", ### Error #19 in TestTorchDeviceTypeXPU , totally 1 , RuntimeError: _share_fd_: only available on CPU "test_module_share_memory_xpu", - ### Error #23 in TestTorchDeviceTypeXPU , totally 26 , AssertionError: RuntimeError not raised : expected a non-deterministic error, but it was not raised - "test_nondeterministic_alert_AdaptiveAvgPool2d_xpu", - "test_nondeterministic_alert_CTCLoss_xpu", - "test_nondeterministic_alert_EmbeddingBag_max_xpu", - "test_nondeterministic_alert_MaxPool3d_xpu", - "test_nondeterministic_alert_NLLLoss_xpu", - "test_nondeterministic_alert_interpolate_bilinear_xpu", - "test_nondeterministic_alert_put_accumulate_xpu", - ### Error #24 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: 'TestTorchDeviceTypeXPU' object has no attribute 'check_device_nondeterministic_alert' - "test_nondeterministic_alert_AvgPool3d_xpu", + + # 'torch.xpu' has no attribute ... ### Error #30 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'BoolStorage' "test_storage_setitem_xpu_bool", "test_tensor_storage_type_xpu_bool", @@ -2384,11 +2398,7 @@ "test_tensor_storage_type_xpu_bfloat16", ### Error #39 in TestTorchDeviceTypeXPU , totally 1 , AttributeError: module 'torch.xpu' has no attribute 'HalfStorage' "test_tensor_storage_type_xpu_float16", - ### Error #40 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_index_add - RuntimeError: expected ... - "test_tensor_storage_type_xpu_uint8", - ### Error #41 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_print - AttributeError: module 'tor... 
- "test_tensor_storage_type_xpu_uint8", - ### Error #42 in TestTorchDeviceTypeXPU , totally 1 , FAILED test_torch_xpu.py::TestTorch::test_storage_error - AttributeError: 'to... + ### Module 'torch.xpu' has no attribute 'ByteStorage' "test_tensor_storage_type_xpu_uint8", # issue 302 , 8 "test_print", @@ -2420,6 +2430,7 @@ ), "test_native_mha_xpu.py": ( + # NestedTensorXPU related OPs # NotImplementedError: Could not run 'aten::_native_multi_head_attention' with arguments from the 'NestedTensorXPU' backend. "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float16", "test_native_multihead_self_attention_use_nt_False_use_padding_True_pad_all_False_need_weights_False_average_attn_weights_False_fused_False_xpu_float32", @@ -2476,6 +2487,7 @@ ), "nn/test_convolution_xpu.py": ( + # Summary: all of them are oneDNN related issues # XPU unsupport ops, skip. # https://github.com/intel/torch-xpu-ops/issues/348 "test_cudnn_convolution_relu_xpu_float16", @@ -2507,7 +2519,6 @@ # https://github.com/intel/torch-xpu-ops/issues/774 "_jiterator_", - # RuntimeError: Short is not supported in oneDNN! Need oneDNN's support, suggest to keep skip. "test_dispatch_meta_outplace_nn_functional_linear_xpu_int16", "test_dispatch_symbolic_meta_outplace_nn_functional_linear_xpu_int16", @@ -2519,7 +2530,6 @@ "test_meta_outplace_nn_functional_linear_xpu_int64", # RuntimeError: Double and complex datatype matmul is not supported in oneDNN - "test_dispatch_meta_inplace_addbmm_xpu_complex", "test_dispatch_meta_outplace_addbmm_xpu_complex", "test_dispatch_symbolic_meta_inplace_addbmm_xpu_complex", diff --git a/test/xpu/test_decomp_xpu.py b/test/xpu/test_decomp_xpu.py index d659197d9..2e39ca90d 100644 --- a/test/xpu/test_decomp_xpu.py +++ b/test/xpu/test_decomp_xpu.py @@ -39,6 +39,7 @@ def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2, (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1, (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2, + (torch.float16, torch.ops.aten.nll_loss2d_backward.default): 1e-4, (torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1, (torch.float16, torch.ops.aten.hardswish.default): 2e-7, (torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7, diff --git a/test/xpu/test_indexing_xpu.py b/test/xpu/test_indexing_xpu.py index d57567318..b4299789e 100644 --- a/test/xpu/test_indexing_xpu.py +++ b/test/xpu/test_indexing_xpu.py @@ -13,6 +13,7 @@ from test_indexing import NumpyTests,TestIndexing import torch + torch.Tensor.is_cuda = torch.Tensor.is_xpu def __test_index_put_accumulate_with_optional_tensors(self, device): # TODO: replace with a better solution. 
diff --git a/test/xpu/test_torch_xpu.py b/test/xpu/test_torch_xpu.py
index 9c54ffdcc..8dce5989c 100644
--- a/test/xpu/test_torch_xpu.py
+++ b/test/xpu/test_torch_xpu.py
@@ -1439,8 +1439,10 @@ def test_nondeterministic_alert_AvgPool3d(self, device):
         res = module(input)
         grad = torch.ones_like(res)

-        self.check_device_nondeterministic_alert(grad, 'avg_pool3d_backward')
-
+        self.check_nondeterministic_alert(
+            lambda: res.backward(grad, retain_graph=True),
+            'avg_pool3d_backward_' + torch.device(device).type,
+            torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')
     @skipIfMPS
     @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
@@ -1478,7 +1480,7 @@ def test_nondeterministic_alert_MaxPool3d(self, device):
         self.check_nondeterministic_alert(
             lambda: res.backward(grad, retain_graph=True),
-            'max_pool3d_with_indices_backward' + torch.device(device).type,
+            'max_pool3d_with_indices_backward_' + torch.device(device).type,
             torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

     @skipIfMPS
@@ -1770,10 +1772,9 @@ def test_nondeterministic_alert_NLLLoss(self, device):
         input = torch.randn(2, 3, 5, 5, device=device)
         target = torch.rand(2, 5, 5, device=device).mul(3).floor().long()
-
         self.check_nondeterministic_alert(
             lambda: module(input, target),
-            'nll_loss2d_forward_out_' + torch.device(device).type + '_template',
+            'nll_loss2d_forward_' + torch.device(device).type,
             torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

     @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
@@ -1788,7 +1789,7 @@ def test_nondeterministic_alert_CTCLoss(self, device):
         self.check_nondeterministic_alert(
             lambda: res.backward(grad, retain_graph=True),
-            'ctc_loss_backward_gpu',
+            'ctc_loss_backward_' + torch.device(device).type,
             torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

     @skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
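The expectation strings updated above ('avg_pool3d_backward_' + device type, 'nll_loss2d_forward_' + device type, 'ctc_loss_backward_' + device type, and so on) have to match the names passed to alertNotDeterministic() on the kernel side. Below is a minimal end-to-end check of one such pairing, assuming an XPU device is available and using warn_only=True so the alert is reported as a UserWarning rather than an error:

import warnings

import torch

torch.use_deterministic_algorithms(True, warn_only=True)

x = torch.randn(1, 3, 8, 8, device="xpu", requires_grad=True)
out = torch.nn.functional.adaptive_avg_pool2d(x, (2, 2))

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    out.sum().backward()

# Expect a warning naming "adaptive_avg_pool2d_backward_xpu", i.e. the string
# registered in AdaptiveAveragePooling2d.cpp at the top of this patch.
print([str(w.message) for w in caught])

torch.use_deterministic_algorithms(False)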