Merge branch 'main' of https://github.com/intel/torch-xpu-ops into majing/histogram
majing921201 committed Jul 22, 2024
2 parents 967cba8 + e9516e6 · commit c7f7ccb
Showing 4 changed files with 97 additions and 62 deletions.
29 changes: 29 additions & 0 deletions src/ATen/native/xpu/EmbeddingBag.cpp
@@ -68,4 +68,33 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> XPUNativeFunctions::
include_last_offset,
padding_idx);
}

Tensor XPUNativeFunctions::_embedding_bag_backward(
const Tensor& grad,
const Tensor& indices,
const Tensor& offsets,
const Tensor& offset2bag,
const Tensor& bag_size,
const Tensor& maximum_indices,
int64_t num_weights,
bool scale_grad_by_freq,
int64_t mode,
bool sparse,
const c10::optional<Tensor>& per_sample_weights,
int64_t padding_idx) {
return at::native::_embedding_bag_backward_symint(
grad,
indices,
offsets,
offset2bag,
bag_size,
maximum_indices,
num_weights,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights,
padding_idx);
}

} // namespace at
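The new entry does not add an XPU-specific kernel; it simply forwards to the shared at::native implementation. As a rough sketch of how the operator is reached from Python, assuming a PyTorch build with XPU support (shapes and values here are illustrative only):

import torch
import torch.nn.functional as F

# Minimal sketch: exercise _embedding_bag_backward through the public API.
# Assumes an XPU-enabled PyTorch build; falls back to CPU otherwise.
device = "xpu" if torch.xpu.is_available() else "cpu"

weight = torch.randn(10, 3, device=device, requires_grad=True)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device=device)
offsets = torch.tensor([0, 4], device=device)  # two bags of four indices each

# The forward runs _embedding_bag; the backward of the sum dispatches to
# _embedding_bag_backward, which this commit routes to
# at::native::_embedding_bag_backward_symint.
out = F.embedding_bag(indices, weight, offsets, mode="mean")
out.sum().backward()
print(weight.grad.shape)  # torch.Size([10, 3])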
124 changes: 67 additions & 57 deletions src/ATen/native/xpu/sycl/DilatedMaxPool2d.cpp
@@ -416,68 +416,78 @@ void launch_max_pool2d_backward_kernel(
int dilation_h,
int dilation_w) {
auto& queue = at::xpu::getCurrentSYCLQueue();
int64_t gradOutputSize =
numBatch * numPlane * gradOutputSizeH * gradOutputSizeW;
int64_t gradInputSize = numBatch * numPlane * gradInputSizeH * gradInputSizeW;
auto out_cf_c_stride = gradOutputSizeH * gradOutputSizeW;
auto in_cf_c_stride = gradInputSizeH * gradInputSizeW;
auto out_n_stride = numPlane * out_cf_c_stride;
auto in_n_stride = numPlane * in_cf_c_stride;
if (globalContext().deterministicAlgorithms() ||
std::is_same_v<scalar_t, at::Half> ||
std::is_same_v<scalar_t, at::BFloat16>) {
using KernelClass =
MaxPool2dBackwardDeterministicKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradInputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
} else {
using KernelClass =
MaxPool2dBackwardKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradOutputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
}

#ifndef XPU_ALLOW_UNDETERMINISTIC
// [Deterministic Note]
//
// By default, we disable the non-deterministic path in this kernel to
// ensure there is no side effect on accuracy. In the future, we may
// re-enable the non-deterministic path to improve performance.
//
// Background: this kernel was observed to behave differently from CUDA
// on AlexNet. To avoid future problems, we decided to always use the
// deterministic path.

using KernelClass =
MaxPool2dBackwardDeterministicKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradInputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradInputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
kernel_h,
kernel_w,
stride_h,
stride_w,
pad_h,
pad_w,
dilation_h,
dilation_w,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
#else
int64_t gradOutputSize =
numBatch * numPlane * gradOutputSizeH * gradOutputSizeW;
using KernelClass =
MaxPool2dBackwardKernelFunctor<scalar_t, is_channels_last>;
BatchKernelConfig cfg = {
1, gradOutputSize, 1, 1, true, BatchKernelConfig::Policy::pAdaptive};
cfg.template build<KernelClass>();
auto kfn = KernelClass(
gradInput,
gradOutput,
indices,
numPlane,
gradInputSizeH,
gradInputSizeW,
gradOutputSizeH,
gradOutputSizeW,
gradOutputSize,
out_cf_c_stride,
in_cf_c_stride,
out_n_stride,
in_n_stride,
cfg);
sycl_kernel_submit(cfg.global_size(), cfg.group_size(), queue, kfn);
#endif
}

void max_pool2d_with_indices_kernel(
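The rewrite turns the former runtime choice between the deterministic and non-deterministic backward kernels into a compile-time switch: unless XPU_ALLOW_UNDETERMINISTIC is defined at build time, the deterministic functor is always submitted, for every dtype. A minimal sketch of a Python call that reaches this kernel, assuming an XPU-enabled build:

import torch
import torch.nn.functional as F

# Sketch only: any max_pool2d backward on an XPU tensor lands in
# launch_max_pool2d_backward_kernel. With XPU_ALLOW_UNDETERMINISTIC left
# undefined (the default after this change), the deterministic kernel runs
# regardless of torch.use_deterministic_algorithms().
device = "xpu" if torch.xpu.is_available() else "cpu"

x = torch.randn(1, 3, 8, 8, device=device, requires_grad=True)
out = F.max_pool2d(x, kernel_size=2, stride=2)
out.sum().backward()
print(x.grad.shape)  # torch.Size([1, 3, 8, 8])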
5 changes: 0 additions & 5 deletions test/xpu/extended/run_test_with_skip.py
@@ -103,11 +103,6 @@
"test_compare_cpu_nn_functional_embedding_bag_xpu_float64",
"test_view_replay_nn_functional_embedding_bag_xpu_float32",

# Not implemented operators, aten::_embedding_bag_backward.
# To retrieve cases when the operators are supported.
# https://github.com/intel/torch-xpu-ops/issues/536
"test_backward_nn_functional_embedding_bag_xpu_float32",

# Double and complex datatype matmul is not supported in oneDNN
"test_compare_cpu_cdist_xpu_float64",

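The skip is removed because aten::_embedding_bag_backward is now implemented for XPU (see EmbeddingBag.cpp above). The re-enabled test compares XPU results against a CPU reference; a hedged sketch of that kind of check, with illustrative names and data rather than the actual test harness:

import torch
import torch.nn.functional as F

def embedding_bag_grad(w, device):
    # Copy the shared CPU weights to the target device as a fresh leaf tensor.
    weight = w.detach().to(device).requires_grad_()
    indices = torch.tensor([0, 2, 5, 1], device=device)
    offsets = torch.tensor([0, 2], device=device)
    F.embedding_bag(indices, weight, offsets, mode="sum").sum().backward()
    return weight.grad.cpu()

w = torch.randn(6, 4)
torch.testing.assert_close(embedding_bag_grad(w, "cpu"),
                           embedding_bag_grad(w, "xpu"))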
1 change: 1 addition & 0 deletions yaml/xpu_functions.yaml
@@ -529,6 +529,7 @@ supported:
- bincount
- _embedding_bag
- _embedding_bag_forward_only
- _embedding_bag_backward
- sgn
- sgn.out
- sgn_
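The yaml entry registers the operator with the torch-xpu-ops code generation, which is what makes the C++ binding above dispatchable for the XPU backend. A quick, hedged smoke test after rebuilding:

import torch

# Should print the aten op packet; before this commit, calling the op with
# XPU tensors failed with a NotImplementedError for the XPU backend.
print(torch.ops.aten._embedding_bag_backward)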
