intel · fengyuan14 · Jul 14, 2024 · Jul 11, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp
@@ -9,6 +9,7 @@
 #include <ATen/native/xpu/sycl/ActivationHardswishKernels.h>
 #include <ATen/native/xpu/sycl/ActivationHardtanhKernels.h>
 #include <ATen/native/xpu/sycl/ActivationLeakyReluKernels.h>
+#include <ATen/native/xpu/sycl/ActivationMishKernels.h>
 #include <ATen/native/xpu/sycl/ActivationSiluKernels.h>
 #include <ATen/native/xpu/sycl/ActivationSoftplusKernels.h>
 #include <ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h>
@@ -632,4 +633,32 @@ Tensor& XPUNativeFunctions::softshrink_backward_out(
   return grad_input;
 }
 
+// Functional (out-of-place) Mish for XPU tensors.
+// A default-constructed tensor is handed to the iterator as the output
+// operand, and the value returned is the iterator's output rather than the
+// local itself.
+Tensor XPUNativeFunctions::mish(const Tensor& self) {
+  Tensor result;
+  auto unary_iter = TensorIterator::unary_op(result, self);
+  native::xpu::mish_kernel(unary_iter);
+  return unary_iter.output();
+}
+
+// Out-variant of Mish: `out` is used as the iterator's output operand and the
+// same reference is returned to the caller.
+Tensor& XPUNativeFunctions::mish_out(const Tensor& self, Tensor& out) {
+  auto unary_iter = TensorIterator::unary_op(out, self);
+  native::xpu::mish_kernel(unary_iter);
+  return out;
+}
+
+// In-place Mish: `self` serves as both the input and the output operand of
+// the iterator, and is returned once the kernel has been launched.
+Tensor& XPUNativeFunctions::mish_(Tensor& self) {
+  auto inplace_iter = TensorIterator::unary_op(self, self);
+  native::xpu::mish_kernel(inplace_iter);
+  return self;
+}
+
+// Backward of Mish: combines `grad_output` with the forward `input` through a
+// binary iterator and returns the resulting gradient tensor.
+Tensor XPUNativeFunctions::mish_backward(
+    const Tensor& grad_output,
+    const Tensor& input) {
+  // Starts as a {0}-sized tensor carrying input's options; it is passed to the
+  // iterator as the output operand.
+  // NOTE(review): presumably TensorIterator resizes it to the broadcast shape
+  // -- confirm against TensorIterator::binary_op.
+  Tensor result = at::empty({0}, input.options());
+  auto grad_iter = TensorIterator::binary_op(result, grad_output, input);
+  native::xpu::mish_backward_kernel(grad_iter);
+  return result;
+}
+
 } // namespace at
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
@@ -276,7 +276,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "max_unpool2d",
     "max_unpool3d",
     "median",
-    "mish.out",
     "mode",
     "multilabel_margin_loss_backward",
     "multilabel_margin_loss_forward",

diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp b/src/ATen/native/xpu/sycl/ActivationMishKernels.cpp
@@ -0,0 +1,56 @@
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/OpMathType.h>
+#include <ATen/native/Activation.h>
+#include <ATen/native/TensorIterator.h>
+
+#include <ATen/native/xpu/sycl/Loops.h>
+#include <comm/XPUMathCompat.h>
+
+namespace at::native::xpu {
+
+// Element-wise Mish forward functor: x * tanh(log1p(exp(x))).
+// The arithmetic is carried out in at::opmath_type<scalar_t>; the product is
+// converted back to scalar_t through the return type.
+template <typename scalar_t>
+struct MishFunctor {
+  scalar_t operator()(scalar_t x) const {
+    using math_t = at::opmath_type<scalar_t>;
+    const auto x_op = static_cast<math_t>(x);
+    // log1p(exp(x)) is ln(1 + e^x), i.e. softplus(x).
+    const auto softplus =
+        c10::xpu::compat::log1p(c10::xpu::compat::exp(x_op));
+    return x_op * c10::xpu::compat::tanh(softplus);
+  }
+};
+
+// Entry point for the Mish forward kernel.
+// Dispatches on iter.dtype() through AT_DISPATCH_FLOATING_TYPES_AND2 (Half and
+// BFloat16 are the two extra types) under the name "mish_xpu", then runs
+// MishFunctor over the iterator with gpu_kernel.
+void mish_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "mish_xpu",
+      [&]() {
+        MishFunctor<scalar_t> forward_op;
+        gpu_kernel(iter, forward_op);
+      });
+}
+
+// Element-wise Mish backward functor.
+// With s = 1 / (1 + exp(-x)) and t = tanh(log1p(exp(x))), it returns
+//   dy * (t + x * s * (1 - t * t)),
+// which is dy times the derivative of x * tanh(log1p(exp(x))).
+// The arithmetic is carried out in at::opmath_type<scalar_t>; the result is
+// converted back to scalar_t through the return type.
+template <typename scalar_t>
+struct MishBackwardFunctor {
+  scalar_t operator()(scalar_t dy, scalar_t x) const {
+    using math_t = at::opmath_type<scalar_t>;
+    const auto dy_op = static_cast<math_t>(dy);
+    const auto x_op = static_cast<math_t>(x);
+    // s: logistic sigmoid of x.
+    const math_t sigmoid =
+        math_t(1) / (math_t(1) + c10::xpu::compat::exp(-x_op));
+    // t: tanh of softplus(x), the same factor the forward pass multiplies by.
+    const math_t tanh_sp = c10::xpu::compat::tanh(
+        c10::xpu::compat::log1p(c10::xpu::compat::exp(x_op)));
+    return dy_op *
+        (tanh_sp + x_op * sigmoid * (math_t(1) - tanh_sp * tanh_sp));
+  }
+};
+
+// Entry point for the Mish backward kernel.
+// Dispatches on iter.dtype() through AT_DISPATCH_FLOATING_TYPES_AND2 (Half and
+// BFloat16 are the two extra types) under the name "mish_backward_xpu", then
+// runs MishBackwardFunctor over the iterator with gpu_kernel.
+void mish_backward_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "mish_backward_xpu",
+      [&]() {
+        MishBackwardFunctor<scalar_t> backward_op;
+        gpu_kernel(iter, backward_op);
+      });
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/ActivationMishKernels.h b/src/ATen/native/xpu/sycl/ActivationMishKernels.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <ATen/native/TensorIterator.h>
+
+namespace at::native::xpu {
+
+// Runs the Mish forward computation, x * tanh(log1p(exp(x))), over `iter`.
+// The implementation in ActivationMishKernels.cpp dispatches on iter.dtype().
+void mish_kernel(TensorIteratorBase& iter);
+
+// Runs the Mish backward computation over `iter`; the functor it applies takes
+// two inputs per element, (dy, x). The implementation in
+// ActivationMishKernels.cpp dispatches on iter.dtype().
+void mish_backward_kernel(TensorIteratorBase& iter);
+
+} // namespace at::native::xpu
diff --git a/src/comm/XPUMathCompat.h b/src/comm/XPUMathCompat.h
@@ -29,6 +29,14 @@ __MATH_FUNCTIONS_DECL__ double rsqrt(double x) {
   return sycl::rsqrt(x);
 }
 
+// float overload of log1p: forwards to the global-namespace ::log1pf.
+__MATH_FUNCTIONS_DECL__ float log1p(float x) {
+  return ::log1pf(x);
+}
+
+// double overload of log1p: forwards to the global-namespace ::log1p.
+__MATH_FUNCTIONS_DECL__ double log1p(double x) {
+  return ::log1p(x);
+}
+
 // To walk around SYCL compiler optimization on data type promotion.
 // c10::Half gets data type promotion in +-*/ operations. See
 // c10/util/Half-inl.h. XPU implementation gets worse precision on half div,

diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py
@@ -61,6 +61,7 @@
     "gt",
     "hardtanh",
     "hardswish",
+    "nn.functional.mish",
     "index_add",
     "index_put",
     "index_select",

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
@@ -118,6 +118,10 @@ supported:
   - softshrink.out
   - softshrink_backward
   - softshrink_backward.grad_input
+  - mish
+  - mish.out
+  - mish_
+  - mish_backward
   - gelu
   - gelu_
   - gelu.out