Refine skip list in test_ops_xpu.py (#1126)
Includes:
- Refine the skip lists of `test_ops_xpu.py` and `test_decomp_xpu.py`
- Refine the non-deterministic operator alert list

---------

Co-authored-by: fengqing.lu <[email protected]>
xytintel and LuFinch authored Dec 4, 2024
1 parent be810b5 commit ea6d3a6
Showing 10 changed files with 102 additions and 77 deletions.
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp
@@ -29,7 +29,7 @@ Tensor adaptive_avg_pool2d_backward_xpu(
(input.ndimension() == 3 || input.ndimension() == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");

globalContext().alertNotDeterministic("_adaptive_avg_pool2d_backward");
globalContext().alertNotDeterministic("adaptive_avg_pool2d_backward_xpu");

Tensor grad_input;
if (input.numel() != 0) {
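For context: `alertNotDeterministic` is the hook behind `torch.use_deterministic_algorithms(True)`. When deterministic algorithms are enforced, the op raises a RuntimeError naming the registered kernel string, so the renames in this commit change the message that users (and the tests below) match against. A minimal sketch of the behavior, assuming an XPU device is available:

import torch

# With deterministic algorithms enforced, the nondeterministic backward
# of adaptive_avg_pool2d should raise rather than run silently.
torch.use_deterministic_algorithms(True)

x = torch.randn(1, 3, 8, 8, device="xpu", requires_grad=True)
y = torch.nn.functional.adaptive_avg_pool2d(x, (2, 2))

try:
    y.sum().backward()
except RuntimeError as e:
    # Expected to mention "adaptive_avg_pool2d_backward_xpu" after this change.
    print(e)

The same check pattern applies to the other alerts renamed in this commit (upsample_bilinear2d, ctc_loss, nll_loss2d).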
1 change: 1 addition & 0 deletions src/ATen/native/xpu/UpSampleBilinear2d.cpp
@@ -30,6 +30,7 @@ TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_xpu)
std::optional<double> scales_h,
std::optional<double> scales_w,
const Tensor& grad_input) {
globalContext().alertNotDeterministic("upsample_bilinear2d_backward_out_xpu");
xpu::upsample_bilinear2d_backward_out_kernel(
grad_input,
grad_output,
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/EmbeddingBag.cpp
@@ -531,6 +531,8 @@ Tensor embedding_bag_backward_xpu_max(
const Tensor& max_indices_t,
int64_t num_weights,
int64_t padding_idx) {
globalContext().alertNotDeterministic("embedding_bag_backward_xpu_max");

auto max_indices = max_indices_t.contiguous();
auto grad_weight = at::zeros({num_weights, grad.size(1)}, grad.options());
int64_t stride = grad_weight.stride(0);
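The max-mode backward scatters gradients to the per-bag argmax rows, so duplicate indices can race; hence the new alert. A short sketch of a call that would trip it under deterministic mode (same assumptions as above):

import torch
import torch.nn.functional as F

torch.use_deterministic_algorithms(True)

weight = torch.randn(10, 3, device="xpu", requires_grad=True)
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device="xpu")
offsets = torch.tensor([0, 4], device="xpu")

out = F.embedding_bag(input, weight, offsets, mode="max")
try:
    out.sum().backward()
except RuntimeError as e:
    print(e)  # expected to mention "embedding_bag_backward_xpu_max"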
13 changes: 11 additions & 2 deletions src/ATen/native/xpu/sycl/Indexing.cpp
@@ -207,7 +207,7 @@ void index_select_kernel(
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
-      AT_EXPAND(AT_FLOAT8_TYPES),
+      AT_EXPAND(AT_FLOAT8_TYPES),
kComplexHalf,
kHalf,
kBool,
@@ -1081,7 +1081,8 @@ void take_kernel(TensorIterator& iter, const TensorBase& input) {
canUse32BitIndexMath(input) ? ScalarType::Int : ScalarType::Long,
"take_xpu_index",
[&] {
-        const scalar_t* indexed_ptr = input.template const_data_ptr<scalar_t>();
+        const scalar_t* indexed_ptr =
+            input.template const_data_ptr<scalar_t>();
TakeFunctor<scalar_t, index_t> f(indexed_ptr);
take_put_kernel_template<scalar_t, index_t>(iter, input, f);
});
@@ -1114,6 +1115,14 @@ void put_kernel(
TensorIterator& iter,
const TensorBase& output,
const bool accumulate) {
+  // Nondeterministic when index contains duplicate entries and we do not
+  // accumulate. If we accumulate on GPU, we use atomicGPUAdd, which is
+  // non-deterministic.
+  if (!accumulate ||
+      (accumulate && iter.tensor(1).device().type() == DeviceType::XPU)) {
+    at::globalContext().alertNotDeterministic("put_");
+  }
+
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
at::ScalarType::BFloat16,
at::ScalarType::Half,
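The condition mirrors the CUDA rule: `put_` is flagged whenever duplicate indices can race, i.e. always when not accumulating (last write wins nondeterministically), and when accumulating on the device via atomic adds. A hedged sketch of the user-visible effect, assuming an XPU build:

import torch

torch.use_deterministic_algorithms(True)

dst = torch.zeros(10, device="xpu")
src = torch.randn(4, device="xpu")
idx = torch.tensor([0, 0, 1, 2], device="xpu")  # note the duplicate index

# Fires for accumulate=False on any device, and for accumulate=True on XPU,
# matching the condition added above.
try:
    dst.put_(idx, src, accumulate=True)
except RuntimeError as e:
    print(e)  # expected to mention "put_"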
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LossCTCKernels.cpp
@@ -1248,7 +1248,7 @@ Tensor ctc_loss_backward_kernel(
bool zero_infinity) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("ctc_loss_backward_kernel");
globalContext().alertNotDeterministic("ctc_loss_backward_xpu");
return AT_DISPATCH_FLOATING_TYPES(
log_probs.scalar_type(), "ctc_loss_backward_xpu", [&] {
if (targets.scalar_type() == kLong) {
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
@@ -186,7 +186,7 @@ void nll_loss2d_forward_kernel(
int64_t reduction,
int64_t ignore_index) {
if (reduction != at::Reduction::None) {
at::globalContext().alertNotDeterministic("nll_loss2d_forward_kernel");
at::globalContext().alertNotDeterministic("nll_loss2d_forward_xpu");
}

total_weight.resize_({});
142 changes: 76 additions & 66 deletions test/xpu/skip_list_common.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/xpu/test_decomp_xpu.py
@@ -39,6 +39,7 @@ def _op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs
(torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2,
(torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1,
(torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2,
+    (torch.float16, torch.ops.aten.nll_loss2d_backward.default): 1e-4,
(torch.bfloat16, torch.ops.aten.nll_loss2d_forward.default): 2e-1,
(torch.float16, torch.ops.aten.hardswish.default): 2e-7,
(torch.bfloat16, torch.ops.aten.hardswish.default): 2e-7,
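The new entry extends the per-(dtype, op) tolerance table that `_op_assert_ref` consults. A minimal illustration of the lookup pattern (the helper name here is hypothetical, not the test's actual API):

import torch

# Hypothetical standalone version of the table lookup; the real test wires
# this dict into _op_assert_ref.
tol_table = {
    (torch.float16, torch.ops.aten.nll_loss2d_forward.default): 1e-2,
    (torch.float16, torch.ops.aten.nll_loss2d_backward.default): 1e-4,
}

def tolerance_for(dtype, op, default=0.0):
    return tol_table.get((dtype, op), default)

print(tolerance_for(torch.float16, torch.ops.aten.nll_loss2d_backward.default))  # 0.0001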
1 change: 1 addition & 0 deletions test/xpu/test_indexing_xpu.py
@@ -13,6 +13,7 @@
from test_indexing import NumpyTests,TestIndexing
import torch

+torch.Tensor.is_cuda = torch.Tensor.is_xpu

def __test_index_put_accumulate_with_optional_tensors(self, device):
# TODO: replace with a better solution.
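Aliasing `is_cuda` to `is_xpu` makes upstream CUDA-oriented device checks treat XPU tensors as if they were CUDA tensors, so inherited tests run unmodified. A small sketch of the effect, assuming an XPU build:

import torch

torch.Tensor.is_cuda = torch.Tensor.is_xpu  # the monkeypatch from the test above

t = torch.empty(1, device="xpu")
print(t.is_cuda)  # True, so upstream `if tensor.is_cuda:` branches take the GPU path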
13 changes: 7 additions & 6 deletions test/xpu/test_torch_xpu.py
@@ -1439,8 +1439,10 @@ def test_nondeterministic_alert_AvgPool3d(self, device):
res = module(input)
grad = torch.ones_like(res)

-        self.check_device_nondeterministic_alert(grad, 'avg_pool3d_backward')
-
+        self.check_nondeterministic_alert(
+            lambda: res.backward(grad, retain_graph=True),
+            'avg_pool3d_backward_' + torch.device(device).type,
+            torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

@skipIfMPS
@skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
@@ -1478,7 +1480,7 @@ def test_nondeterministic_alert_MaxPool3d(self, device):

self.check_nondeterministic_alert(
lambda: res.backward(grad, retain_graph=True),
-            'max_pool3d_with_indices_backward' + torch.device(device).type,
+            'max_pool3d_with_indices_backward_' + torch.device(device).type,
torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

@skipIfMPS
Expand Down Expand Up @@ -1770,10 +1772,9 @@ def test_nondeterministic_alert_NLLLoss(self, device):
input = torch.randn(2, 3, 5, 5, device=device)
target = torch.rand(2, 5, 5, device=device).mul(3).floor().long()


        self.check_nondeterministic_alert(
            lambda: module(input, target),
-            'nll_loss2d_forward_out_' + torch.device(device).type + '_template',
+            'nll_loss2d_forward_' + torch.device(device).type,
torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

@skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
Expand All @@ -1788,7 +1789,7 @@ def test_nondeterministic_alert_CTCLoss(self, device):

self.check_nondeterministic_alert(
lambda: res.backward(grad, retain_graph=True),
-            'ctc_loss_backward_gpu',
+            'ctc_loss_backward_' + torch.device(device).type,
torch.device(device).type == 'cuda' or torch.device(device).type == 'xpu')

@skipIfTorchInductor("https://github.com/pytorch/pytorch/issues/113707")
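The assertions above rely on `check_nondeterministic_alert` from the inherited upstream suite. A simplified sketch of what such a helper verifies (not the exact upstream implementation):

import torch

def check_nondeterministic_alert(fn, caller_name, should_alert=True):
    # Run fn with deterministic algorithms enforced and check that it raises
    # a RuntimeError naming the expected kernel (or that it does not alert).
    prev = torch.are_deterministic_algorithms_enabled()
    torch.use_deterministic_algorithms(True)
    alerted = False
    try:
        fn()
    except RuntimeError as e:
        alerted = caller_name in str(e)
    finally:
        torch.use_deterministic_algorithms(prev)
    assert alerted == should_alert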
