Add aten::erfinv, aten::exp2, aten::expm1, aten::exponential_ (#527)

- erfinv - erfinv_ - erfinv.out - exp2 - exp2_ - exp2.out - expm1 - expm1_ - expm1.out - exponential_ --------- Co-authored-by: Feng Yuan <[email protected]>
intel · Jul 20, 2024 · 2258cb4 · 2258cb4
1 parent b8888da
commit 2258cb4
Show file tree

Hide file tree

Showing 14 changed files with 328 additions and 14 deletions.
diff --git a/src/ATen/native/xpu/Distributions.cpp b/src/ATen/native/xpu/Distributions.cpp
@@ -191,6 +191,24 @@ Tensor& XPUNativeFunctions::random_(
   return random_(self, 0, to, std::move(generator));
 }
 
+// Adapter passed as the stub template argument to
+// native::templates::exponential_impl_ (see XPUNativeFunctions::exponential_).
+// It forwards the iterator, rate and optional generator to the XPU
+// exponential kernel. The RNG template parameter is not used in the body.
+template <typename RNG>
+struct ExponentialStub {
+  void operator()(
+      TensorIteratorBase& iter,
+      double lambda,
+      c10::optional<Generator> gen) {
+    native::xpu::exponential_kernel(iter, lambda, gen);
+  }
+};
+
+// In-place exponential_ for XPU. Delegates to
+// native::templates::exponential_impl_, instantiated with ExponentialStub,
+// passing `self`, the rate `lambda` and the (moved) optional generator, and
+// returns whatever that implementation returns.
+Tensor& XPUNativeFunctions::exponential_(
+    Tensor& self,
+    double lambda,
+    std::optional<Generator> generator) {
+  return native::templates::exponential_impl_<ExponentialStub, Generator>(
+      self, lambda, std::move(generator));
+}
+
 /* The largest consecutive integer representable in float32 (2^24) */
 constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24);
 

diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp
@@ -613,6 +613,72 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) {
   return out;
 }
 
+// Out-of-place erfinv: builds a unary float-op iterator over a
+// default-constructed output tensor and `self`, runs the XPU erfinv kernel,
+// and returns the iterator's output.
+Tensor XPUNativeFunctions::erfinv(const Tensor& self) {
+  Tensor result;
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(result, self);
+  native::xpu::erfinv_kernel(it);
+  return it.output();
+}
+
+// In-place erfinv: `self` is used as both the output and the input of the
+// iterator; returns `self` after the XPU erfinv kernel has run.
+Tensor& XPUNativeFunctions::erfinv_(Tensor& self) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(self, self);
+  native::xpu::erfinv_kernel(it);
+  return self;
+}
+
+// erfinv.out: runs the XPU erfinv kernel with `self` as input and the
+// caller-provided `out` as output, then returns `out`.
+Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(out, self);
+  native::xpu::erfinv_kernel(it);
+  return out;
+}
+
+// Out-of-place exp2: builds a unary float-op iterator over a
+// default-constructed output tensor and `self`, runs the XPU exp2 kernel,
+// and returns the iterator's output.
+Tensor XPUNativeFunctions::exp2(const Tensor& self) {
+  Tensor result;
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(result, self);
+  native::xpu::exp2_kernel(it);
+  return it.output();
+}
+
+// In-place exp2: `self` is used as both the output and the input of the
+// iterator; returns `self` after the XPU exp2 kernel has run.
+Tensor& XPUNativeFunctions::exp2_(Tensor& self) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(self, self);
+  native::xpu::exp2_kernel(it);
+  return self;
+}
+
+// exp2.out: runs the XPU exp2 kernel with `self` as input and the
+// caller-provided `out` as output, then returns `out`.
+Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(out, self);
+  native::xpu::exp2_kernel(it);
+  return out;
+}
+
+// Out-of-place expm1: builds a unary float-op iterator over a
+// default-constructed output tensor and `self`, runs the XPU expm1 kernel,
+// and returns the iterator's output.
+Tensor XPUNativeFunctions::expm1(const Tensor& self) {
+  Tensor result;
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(result, self);
+  native::xpu::expm1_kernel(it);
+  return it.output();
+}
+
+// In-place expm1: `self` is used as both the output and the input of the
+// iterator; returns `self` after the XPU expm1 kernel has run.
+Tensor& XPUNativeFunctions::expm1_(Tensor& self) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(self, self);
+  native::xpu::expm1_kernel(it);
+  return self;
+}
+
+// expm1.out: runs the XPU expm1 kernel with `self` as input and the
+// caller-provided `out` as output, then returns `out`.
+Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) {
+  TensorIterator it;
+  it.build_borrowing_unary_float_op(out, self);
+  native::xpu::expm1_kernel(it);
+  return out;
+}
+
 Tensor XPUNativeFunctions::frac(const Tensor& self) {
   Tensor out;
   TensorIterator iter;

diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
@@ -179,10 +179,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "_efficient_attention_forward",
     "_embedding_bag_dense_backward",
     "_embedding_bag_per_sample_weights_backward",
-    "erfinv.out",
-    "exp2.out",
-    "expm1.out",
-    "exponential_",
     "_fft_c2c",
     "_fft_c2r",
     "_fft_r2c",

diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp
@@ -0,0 +1,21 @@
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/xpu/sycl/DistributionTemplates.h>
+#include <ATen/native/xpu/sycl/Philox4x32.h>
+#include <ATen/xpu/XPUGeneratorImpl.h>
+#include <comm/DeviceProperties.h>
+#include <comm/Runtime.h>
+
+namespace at::native::xpu {
+
+// Non-template entry point for the exponential distribution on XPU.
+// Resolves `gen` to an XPUGeneratorImpl (falling back to the default XPU
+// generator) and hands off to the templated kernel in
+// at::native::templates::xpu.
+void exponential_kernel(
+    TensorIteratorBase& iter,
+    double lambda,
+    c10::optional<Generator> gen) {
+  auto xpu_generator = get_generator_or_default<at::XPUGeneratorImpl>(
+      gen, at::xpu::detail::getDefaultXPUGenerator());
+  at::native::templates::xpu::exponential_kernel(
+      iter, lambda, xpu_generator);
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h
@@ -38,4 +38,9 @@ void bernoulli_scalar_kernel(
     double p,
     c10::optional<Generator> gen);
 
+// Exponential-distribution kernel for XPU. Takes the iterator to operate on,
+// the rate parameter `lambda`, and an optional generator.
+void exponential_kernel(
+    TensorIteratorBase& iter,
+    double lambda,
+    c10::optional<Generator> gen);
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h
@@ -591,6 +591,51 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) {
       });
 }
 
+// ====================== Exponential ======================
+
+// Maps a uniform random sample `val` to an exponential variate with rate
+// `lambd_`, computed as (-1 / lambd_) * log(val). Samples at or above
+// 1 - epsilon(scalar_t)/2 use -epsilon(scalar_t)/2 in place of log(val).
+template <typename scalar_t, typename accscalar_t>
+struct ExponentialFunctor {
+  auto operator()(accscalar_t val) const {
+    // BEFORE TOUCHING THIS CODE READ:
+    // https://github.com/pytorch/pytorch/issues/16706
+    // rand_uniform has (0,1] bounds. log(1) is 0 and exponential
+    // excludes 0. we need log to be not 0, and not underflow when
+    // converted to half
+    accscalar_t log;
+    if (val >= static_cast<accscalar_t>(1.f) -
+            std::numeric_limits<scalar_t>::epsilon() / 2.f) {
+      // Clamp: substitute a small negative value so the result is never 0.
+      log = -std::numeric_limits<scalar_t>::epsilon() / 2.f;
+    } else {
+      log = std::log(val);
+    }
+    return static_cast<accscalar_t>(-1.f) / lambd_ * log;
+  }
+  // Stores the rate parameter used by operator().
+  ExponentialFunctor(accscalar_t lambd) : lambd_(lambd) {}
+
+ private:
+  // Rate parameter of the exponential distribution.
+  accscalar_t lambd_;
+};
+
+// Templated exponential kernel: checks that the iterator dtype is floating
+// point, then dispatches over floating types (plus Half and BFloat16) and
+// applies ExponentialFunctor to uniform samples via uniform_and_transform.
+template <typename RNG>
+void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG gen) {
+  const auto out_dtype = iter.dtype();
+  TORCH_CHECK(
+      isFloatingType(out_dtype),
+      "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ",
+      out_dtype);
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      out_dtype,
+      "exponential__xpu_",
+      [&] {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        // The rate is converted once to the accumulation type.
+        ExponentialFunctor<scalar_t, accscalar_t> transform(
+            static_cast<accscalar_t>(lambda));
+        uniform_and_transform<scalar_t, accscalar_t, rand4_engine_calls>(
+            iter, gen, transform);
+      });
+}
+
 } // namespace xpu
 } // namespace templates
 } // namespace native

diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp
@@ -224,4 +224,30 @@ void nan_to_num_kernel(
   }
 }
 
+// Element-wise functor returning std::expm1(a) for non-complex scalar types.
+template <typename scalar_t>
+struct Expm1Functor {
+  scalar_t operator()(scalar_t a) const {
+    return std::expm1(a);
+  }
+};
+
+// Complex specialization of expm1. For z = x + i*y:
+//   expm1(z) = expm1(x) * cos(y) - 2 * sin(y / 2)^2 + i * exp(x) * sin(y)
+// All arithmetic is kept in T. A bare `.5` literal would promote the
+// complex<float> path to double.
+template <typename T>
+struct Expm1Functor<c10::complex<T>> {
+  c10::complex<T> operator()(c10::complex<T> x) const {
+    const T sin_half_imag = std::sin(static_cast<T>(0.5) * x.imag());
+    const T re = std::expm1(x.real()) * std::cos(x.imag()) -
+        static_cast<T>(2) * sin_half_imag * sin_half_imag;
+    const T im = std::exp(x.real()) * std::sin(x.imag());
+    return c10::complex<T>(re, im);
+  }
+};
+
+// Dispatches on the iterator's common dtype (floating and complex types plus
+// Half and BFloat16) and runs Expm1Functor element-wise through gpu_kernel.
+void expm1_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.common_dtype(),
+      "expm1_xpu",
+      [&]() {
+        Expm1Functor<scalar_t> op{};
+        gpu_kernel(iter, op);
+      });
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h
@@ -12,6 +12,8 @@ void bitwise_not_kernel(TensorIteratorBase& iter);
 
 void exp_kernel(TensorIteratorBase& iter);
 
+// XPU kernel for expm1, operating on the given iterator.
+void expm1_kernel(TensorIteratorBase& iter);
+
 void nan_to_num_kernel(
     TensorIteratorBase& iter,
     std::optional<double> nan,

diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
@@ -77,6 +77,106 @@ void erfc_kernel(TensorIteratorBase& iter) {
       [&]() { gpu_kernel(iter, ErfcFunctor<scalar_t>()); });
 }
 
+template <typename scalar_t>
+struct ErfinvFunctor {
+  using opmath_type = at::opmath_type<scalar_t>;
+
+  scalar_t operator()(scalar_t in) const {
+    scalar_t out;
+    opmath_type z, num, dem;
+
+    auto x = static_cast<opmath_type>(in);
+    if (std::fabs(x) > 1.0f) {
+      out = static_cast<scalar_t>(NAN);
+      return out;
+    }
+    if (std::fabs(x) == 1.0f) {
+      out = static_cast<scalar_t>(
+          (std::copysign(1.0, static_cast<double>(x))) *
+          (std::numeric_limits<double>::infinity()));
+      return out;
+    }
+    if (std::fabs(x) <= 0.7f) {
+      z = x * x;
+      num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]);
+      dem =
+          ((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z +
+           static_cast<opmath_type>(1.0));
+      out = x * num / dem;
+    } else {
+      z = static_cast<opmath_type>(
+          std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0)));
+      num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0];
+      dem = (d_[1] * z + d_[0]) * z + static_cast<opmath_type>(1.0);
+      out = static_cast<scalar_t>(
+          static_cast<opmath_type>(std::copysign(1.0, static_cast<double>(x))) *
+          num / dem);
+    }
+    out = out -
+        static_cast<scalar_t>(
+              (std::erf(static_cast<double>(out)) - x) /
+              ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
+    out = out -
+        static_cast<scalar_t>(
+              (std::erf(static_cast<double>(out)) - x) /
+              ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
+    return out;
+  }
+
+  static constexpr double PI_f64_ = 3.14159265358979323846;
+  static constexpr std::array<opmath_type, 4> a_ = {
+      0.886226899,
+      -1.645349621,
+      0.914624893,
+      -0.140543331};
+  static constexpr std::array<opmath_type, 4> b_ = {
+      -2.118377725,
+      1.442710462,
+      -0.329097515,
+      0.012229801};
+  static constexpr std::array<opmath_type, 4> c_ = {
+      -1.970840454,
+      -1.624906493,
+      3.429567803,
+      1.641345311};
+  static constexpr std::array<opmath_type, 2> d_ = {3.543889200, 1.637067800};
+};
+
+// Dispatches on the iterator's common dtype (floating types plus Half and
+// BFloat16) and runs ErfinvFunctor element-wise through gpu_kernel.
+void erfinv_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      ScalarType::Half,
+      ScalarType::BFloat16,
+      iter.common_dtype(),
+      "erfinv_xpu",
+      [&]() {
+        ErfinvFunctor<scalar_t> op{};
+        gpu_kernel(iter, op);
+      });
+}
+
+// Element-wise functor returning std::exp2(a) for non-complex scalar types.
+template <typename scalar_t>
+struct Exp2Functor {
+  scalar_t operator()(scalar_t a) const {
+    return std::exp2(a);
+  }
+};
+
+// Complex specialization of exp2. std::exp2 has no complex overload, so the
+// result is computed through the identity 2^x = e^(ln(2) * x).
+template <typename T>
+struct Exp2Functor<c10::complex<T>> {
+  c10::complex<T> operator()(c10::complex<T> x) const {
+    // ln(2), narrowed to the component type T.
+    const T natural_log_of_two =
+        static_cast<T>(0.693147180559945309417232121458176);
+    return std::exp(natural_log_of_two * x);
+  }
+};
+
+// Dispatches on the iterator's common dtype (floating and complex types plus
+// Half and BFloat16) and runs Exp2Functor element-wise through gpu_kernel.
+void exp2_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.common_dtype(),
+      "exp2_xpu",
+      [&]() {
+        Exp2Functor<scalar_t> op{};
+        gpu_kernel(iter, op);
+      });
+}
+
 template <typename scalar_t>
 struct Logit0Functor {
   using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;

diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h
@@ -10,6 +10,10 @@ void erf_kernel(TensorIteratorBase& iter);
 
 void erfc_kernel(TensorIteratorBase& iter);
 
+// XPU kernel for erfinv (inverse error function), operating on the given iterator.
+void erfinv_kernel(TensorIteratorBase& iter);
+
+// XPU kernel for exp2 (base-2 exponential), operating on the given iterator.
+void exp2_kernel(TensorIteratorBase& iter);
+
 void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar);
 
 } // namespace at::native::xpu
diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py
@@ -10,6 +10,7 @@
     #     a. Different kernel implementations.
     #     b. Different std functions. (std::log, std::tanh, std::exp)
     # 5. The result of division between two same float values is not 1.
+    # 6. std functions get different results when input is nan or inf between GCC and SYCL.
     "test_compare_cpu_cumsum_xpu_bfloat16",
     "test_compare_cpu_cumsum_xpu_float16",
     "test_compare_cpu_log_xpu_complex64",
@@ -34,17 +35,18 @@
     "test_compare_cpu_cross_xpu_float16",
     "test_compare_cpu_floor_divide_xpu_bfloat16",
     "test_compare_cpu_floor_divide_xpu_float16",
-
-    # got inconsistent values between CPU / XPU
-    # AssertionError: Tensor-likes are not close!
-    # compute results contain nan / inf
+    "test_compare_cpu_exp_xpu_bfloat16",
+    "test_compare_cpu_exp_xpu_complex128",
+    "test_compare_cpu_exp_xpu_complex64",
     "test_compare_cpu_acosh_xpu_complex64",
     "test_compare_cpu_asin_xpu_complex128",
     "test_compare_cpu_asin_xpu_complex64",
     "test_compare_cpu_asinh_xpu_complex128",
     "test_compare_cpu_asinh_xpu_complex64",
     "test_compare_cpu_atan_xpu_complex128",
     "test_compare_cpu_atan_xpu_complex64",
+    "test_compare_cpu_exp2_xpu_complex128",
+    "test_compare_cpu_exp2_xpu_complex64",
 
     # skip random failure due to accuracy
     # AssertionError: Tensor-likes are not close!