Skip to content

Commit

Permalink
Merge branch 'main' into mengfeil/weekly
Browse files Browse the repository at this point in the history
  • Loading branch information
mengfei25 authored Jul 29, 2024
2 parents de5e34f + 0608225 commit 2f31d86
Show file tree
Hide file tree
Showing 21 changed files with 1,323 additions and 134 deletions.
54 changes: 52 additions & 2 deletions src/ATen/native/xpu/Indexing.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
#include <ATen/ATen.h>
#include <ATen/ExpandUtils.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/core/op_registration/adaption.h>
#include <ATen/xpu/XPUNativeFunctions.h>

#include <ATen/native/xpu/sycl/IndexingKernels.h>
#include <ATen/xpu/XPUNativeFunctions.h>
#include <comm/TensorInfo.h>

namespace at {
Expand Down Expand Up @@ -44,4 +45,53 @@ Tensor XPUNativeFunctions::index_select(
return index_select_out(self, dim, index, out);
}

// Shared implementation for masked_select / masked_select_out.
// Gathers the elements of `self` at positions where `mask` is true into
// `result`, by broadcasting `mask` and `self` against each other and then
// delegating to boolean-tensor indexing (index_out) with the mask as index.
static Tensor& masked_select_out_impl(
    Tensor& result,
    const Tensor& self,
    const Tensor& mask) {
  // Suppress named-tensor name propagation; the public entry points compute
  // the broadcast outnames themselves before calling in here.
  NoNamesGuard guard;

  TORCH_CHECK(
      mask.scalar_type() == ScalarType::Bool,
      "masked_select: expected BoolTensor for mask");
  TORCH_CHECK(
      self.scalar_type() == result.scalar_type(),
      "masked_select(): self and result must have the same scalar type");

  // Promote 0-dim inputs to 1-dim so the broadcast below is well-defined.
  // MaybeOwned lets us avoid materializing a new Tensor handle when no
  // unsqueeze is needed (the common case).
  auto mask_temp = (mask.dim() == 0)
      ? c10::MaybeOwned<Tensor>::owned(mask.unsqueeze(0))
      : c10::MaybeOwned<Tensor>::borrowed(mask);
  auto self_temp = (self.dim() == 0)
      ? c10::MaybeOwned<Tensor>::owned(self.unsqueeze(0))
      : c10::MaybeOwned<Tensor>::borrowed(self);

  // Cannot reassign to mask_temp and self_temp here! if they are
  // owning and expand_outplace returns a borrow, the returned borrow
  // would dangle.
  auto mask_self_expanded = expand_outplace(*mask_temp, *self_temp);
  // get<1> is the expanded self (the tensor being indexed); get<0> is the
  // expanded boolean mask, passed as the single index. The move of the
  // mask into the index list is safe: mask_self_expanded is not used again.
  XPUNativeFunctions::index_out(
      *std::get<1>(mask_self_expanded),
      c10::List<std::optional<at::Tensor>>(
          {*std::move(std::get<0>(mask_self_expanded))}),
      result);

  return result;
}

// Functional masked_select: allocates an empty result tensor carrying
// self's options, then fills it via the shared out-variant implementation.
Tensor XPUNativeFunctions::masked_select(
    const Tensor& self,
    const Tensor& mask) {
  namedinference::compute_broadcast_outnames(self, mask);
  Tensor out = at::empty({0}, self.options());
  masked_select_out_impl(out, self, mask);
  return out;
}

// Out-variant of masked_select: writes the selected elements of `self`
// (where `mask` is true) into the caller-provided `result`.
Tensor& XPUNativeFunctions::masked_select_out(
    const Tensor& self,
    const Tensor& mask,
    Tensor& result) {
  namedinference::compute_broadcast_outnames(self, mask);
  masked_select_out_impl(result, self, mask);
  return result;
}

} // namespace at
5 changes: 3 additions & 2 deletions src/ATen/native/xpu/NMS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ Tensor nms(const Tensor& dets, const Tensor& scores, double iou_threshold_) {
auto mask = nms_kernel(dets_sorted, iou_threshold);

at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr();
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.mutable_data_ptr();

std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

at::Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = (int64_t*)keep.data_ptr();
int64_t* keep_out = keep.mutable_data_ptr<int64_t>();

int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) {
Expand Down
34 changes: 34 additions & 0 deletions src/ATen/native/xpu/ReduceOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,40 @@ Tensor XPUNativeFunctions::amin(
return out;
}

// Sum of tensor elements treating NaN as zero, written into `result`.
//
// Args:
//   self:      input tensor.
//   dim:       optional set of dimensions to reduce over (nullopt = all dims).
//   keepdim:   if true, retain reduced dimensions with size 1.
//   opt_dtype: optional accumulation/output dtype override.
//   result:    output tensor; resized here to the reduction shape.
//
// Returns: `result`.
Tensor& XPUNativeFunctions::nansum_out(
    const Tensor& self,
    at::OptionalIntArrayRef dim,
    bool keepdim,
    // std::-qualified for consistency with the sibling nansum overload below.
    std::optional<ScalarType> opt_dtype,
    Tensor& result) {
  // For integral types, use existing sum as
  // integral types don't have `Nan`.
  if (c10::isIntegralType(self.scalar_type(), true)) {
    return at::sum_out(result, self, dim, keepdim, opt_dtype);
  }

  auto out_dtype = infer_dtype_from_optional(self, opt_dtype, result);
  result = resize_reduction(result, self, dim, keepdim, out_dtype);
  auto iter = meta::make_reduction_from_out_ty(
      self, result, dim, keepdim, result.scalar_type());

  if (iter.numel() == 0) {
    // Empty reduction: a sum over no elements is zero by convention.
    result = result.zero_();
  } else {
    native::xpu::nansum_kernel(iter);
  }
  return result;
}

// Functional nansum: allocates the result tensor and forwards all
// arguments to the out-variant, which handles dtype inference and resizing.
Tensor XPUNativeFunctions::nansum(
    const Tensor& self,
    at::OptionalIntArrayRef dim,
    bool keepdim,
    std::optional<ScalarType> opt_dtype) {
  Tensor out;
  nansum_out(self, dim, keepdim, opt_dtype, out);
  return out;
}

static ScalarType get_result_or_self_value_dtype(
const Tensor& self,
const Tensor& result,
Expand Down
Loading

0 comments on commit 2f31d86

Please sign in to comment.