Skip to content

Commit

Permalink
Add aten::erfinv, aten::exp2, aten::expm1, aten::exponential_ (#527)
Browse files Browse the repository at this point in the history
- erfinv
  - erfinv_
  - erfinv.out
  - exp2
  - exp2_
  - exp2.out
  - expm1
  - expm1_
  - expm1.out
  - exponential_

---------

Co-authored-by: Feng Yuan <[email protected]>
  • Loading branch information
hjhee and fengyuan14 authored Jul 20, 2024
1 parent b8888da commit 2258cb4
Show file tree
Hide file tree
Showing 14 changed files with 328 additions and 14 deletions.
18 changes: 18 additions & 0 deletions src/ATen/native/xpu/Distributions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,24 @@ Tensor& XPUNativeFunctions::random_(
return random_(self, 0, to, std::move(generator));
}

template <typename RNG>
struct ExponentialStub {
void operator()(
TensorIteratorBase& iter,
double lambda,
c10::optional<Generator> gen) {
native::xpu::exponential_kernel(iter, lambda, gen);
}
};

// In-place exponential sampling for XPU tensors. All work is delegated to
// the shared exponential_impl_ template, instantiated with ExponentialStub
// so that the XPU kernel is the one that gets launched. The generator is
// moved into the callee.
Tensor& XPUNativeFunctions::exponential_(
    Tensor& self,
    double lambda,
    std::optional<Generator> generator) {
  Tensor& sampled =
      native::templates::exponential_impl_<ExponentialStub, Generator>(
          self, lambda, std::move(generator));
  return sampled;
}

/* The largest consecutive integer representable in float32 (2^24): every
 * integer from 0 up to and including this value has an exact float32
 * representation. */
constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24);

Expand Down
66 changes: 66 additions & 0 deletions src/ATen/native/xpu/UnaryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,72 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) {
return out;
}

// Out-of-place erfinv: returns the iterator's output tensor after running
// the XPU erfinv kernel over `self`.
Tensor XPUNativeFunctions::erfinv(const Tensor& self) {
  // Default-constructed result placeholder; declared ahead of the iterator
  // so that it is destroyed after it.
  Tensor result;
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(result, self);
  native::xpu::erfinv_kernel(unary_iter);
  return unary_iter.output();
}

// In-place erfinv: `self` is both the input and the output operand of the
// iterator; the same reference is returned to the caller.
Tensor& XPUNativeFunctions::erfinv_(Tensor& self) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(self, self);
  native::xpu::erfinv_kernel(unary_iter);
  return self;
}

// erfinv with a caller-supplied destination: runs the XPU erfinv kernel with
// `out` as the output operand and `self` as the input, then returns `out`.
Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(out, self);
  native::xpu::erfinv_kernel(unary_iter);
  return out;
}

// Out-of-place exp2: returns the iterator's output tensor after running the
// XPU exp2 kernel over `self`.
Tensor XPUNativeFunctions::exp2(const Tensor& self) {
  // Default-constructed result placeholder; declared ahead of the iterator
  // so that it is destroyed after it.
  Tensor result;
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(result, self);
  native::xpu::exp2_kernel(unary_iter);
  return unary_iter.output();
}

// In-place exp2: `self` is both the input and the output operand of the
// iterator; the same reference is returned to the caller.
Tensor& XPUNativeFunctions::exp2_(Tensor& self) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(self, self);
  native::xpu::exp2_kernel(unary_iter);
  return self;
}

// exp2 with a caller-supplied destination: runs the XPU exp2 kernel with
// `out` as the output operand and `self` as the input, then returns `out`.
Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(out, self);
  native::xpu::exp2_kernel(unary_iter);
  return out;
}

// Out-of-place expm1: returns the iterator's output tensor after running the
// XPU expm1 kernel over `self`.
Tensor XPUNativeFunctions::expm1(const Tensor& self) {
  // Default-constructed result placeholder; declared ahead of the iterator
  // so that it is destroyed after it.
  Tensor result;
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(result, self);
  native::xpu::expm1_kernel(unary_iter);
  return unary_iter.output();
}

// In-place expm1: `self` is both the input and the output operand of the
// iterator; the same reference is returned to the caller.
Tensor& XPUNativeFunctions::expm1_(Tensor& self) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(self, self);
  native::xpu::expm1_kernel(unary_iter);
  return self;
}

// expm1 with a caller-supplied destination: runs the XPU expm1 kernel with
// `out` as the output operand and `self` as the input, then returns `out`.
Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) {
  TensorIterator unary_iter;
  unary_iter.build_borrowing_unary_float_op(out, self);
  native::xpu::expm1_kernel(unary_iter);
  return out;
}

Tensor XPUNativeFunctions::frac(const Tensor& self) {
Tensor out;
TensorIterator iter;
Expand Down
4 changes: 0 additions & 4 deletions src/ATen/native/xpu/XPUFallback.template
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
"_efficient_attention_forward",
"_embedding_bag_dense_backward",
"_embedding_bag_per_sample_weights_backward",
"erfinv.out",
"exp2.out",
"expm1.out",
"exponential_",
"_fft_c2c",
"_fft_c2r",
"_fft_r2c",
Expand Down
21 changes: 21 additions & 0 deletions src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/DistributionTemplates.h>
#include <ATen/native/xpu/sycl/Philox4x32.h>
#include <ATen/xpu/XPUGeneratorImpl.h>
#include <comm/DeviceProperties.h>
#include <comm/Runtime.h>

namespace at::native::xpu {

// Non-template entry point for the exponential distribution kernel.
// Resolves the optional generator to an XPUGeneratorImpl (using the default
// XPU generator as the fallback argument) and hands off to the templated
// implementation in at::native::templates::xpu.
void exponential_kernel(
    TensorIteratorBase& iter,
    double lambda,
    c10::optional<Generator> gen) {
  const auto& fallback_gen = at::xpu::detail::getDefaultXPUGenerator();
  auto xpu_gen =
      get_generator_or_default<at::XPUGeneratorImpl>(gen, fallback_gen);
  at::native::templates::xpu::exponential_kernel(iter, lambda, xpu_gen);
}

} // namespace at::native::xpu
5 changes: 5 additions & 0 deletions src/ATen/native/xpu/sycl/DistributionKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,9 @@ void bernoulli_scalar_kernel(
double p,
c10::optional<Generator> gen);

void exponential_kernel(
TensorIteratorBase& iter,
double lambda,
c10::optional<Generator> gen);

} // namespace at::native::xpu
45 changes: 45 additions & 0 deletions src/ATen/native/xpu/sycl/DistributionTemplates.h
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,51 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) {
});
}

// ====================== Exponential ======================

// Maps a uniform sample `val` to an exponential sample with rate `lambd`:
// the result is (-1 / lambd) * log(val), with the logarithm clamped away
// from zero for inputs at (or within half an epsilon of) 1.
template <typename scalar_t, typename accscalar_t>
struct ExponentialFunctor {
  ExponentialFunctor(accscalar_t lambd) : lambd_(lambd) {}

  auto operator()(accscalar_t val) const {
    // BEFORE TOUCHING THIS CODE READ:
    // https://github.com/pytorch/pytorch/issues/16706
    // rand_uniform has (0,1] bounds. log(1) is 0 and exponential
    // excludes 0. we need log to be not 0, and not underflow when
    // converted to half
    const auto clamp_threshold = static_cast<accscalar_t>(1.f) -
        std::numeric_limits<scalar_t>::epsilon() / 2.f;

    accscalar_t log_term;
    // Keep the comparison in this direction: a NaN input must fall through
    // to the std::log branch.
    if (val >= clamp_threshold) {
      log_term = -std::numeric_limits<scalar_t>::epsilon() / 2.f;
    } else {
      log_term = std::log(val);
    }
    return static_cast<accscalar_t>(-1.f) / lambd_ * log_term;
  }

 private:
  accscalar_t lambd_;
};

// Templated exponential kernel driver. Rejects non-floating dtypes, then
// dispatches over floating types plus Half and BFloat16, builds an
// ExponentialFunctor with the rate converted to the accumulate type, and
// runs it through uniform_and_transform.
template <typename RNG>
void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG gen) {
  const auto dtype = iter.dtype();
  TORCH_CHECK(
      isFloatingType(dtype),
      "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ",
      dtype);
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      dtype,
      "exponential__xpu_",
      [&] {
        using accscalar_t = at::acc_type<scalar_t, true>;
        const auto rate = static_cast<accscalar_t>(lambda);
        ExponentialFunctor<scalar_t, accscalar_t> transform(rate);
        uniform_and_transform<scalar_t, accscalar_t, rand4_engine_calls>(
            iter, gen, transform);
      });
}

} // namespace xpu
} // namespace templates
} // namespace native
Expand Down
26 changes: 26 additions & 0 deletions src/ATen/native/xpu/sycl/UnaryKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,4 +224,30 @@ void nan_to_num_kernel(
}
}

// Element-wise exp(a) - 1 for real scalar types, computed with std::expm1
// and converted back to scalar_t on return.
template <typename scalar_t>
struct Expm1Functor {
  scalar_t operator()(scalar_t a) const {
    const auto shifted_exp = std::expm1(a);
    return shifted_exp;
  }
};

// Complex specialization of expm1.
//
// With z = x + i*y:
//   expm1(z) = e^x * (cos(y) + i*sin(y)) - 1
//            = expm1(x) * cos(y) - (1 - cos(y)) + i * e^x * sin(y)
//            = expm1(x) * cos(y) - 2 * sin(y/2)^2 + i * e^x * sin(y)
// which avoids the cancellation of computing exp(z) - 1 directly for small z.
//
// All intermediates are kept in T. The previous `.5 * x.imag()` used a
// double literal, which silently promoted the complex<float> path to double
// arithmetic before narrowing the real part back to float.
template <typename T>
struct Expm1Functor<c10::complex<T>> {
  c10::complex<T> operator()(c10::complex<T> x) const {
    const T sin_half_imag = std::sin(x.imag() / 2);
    const T re = std::expm1(x.real()) * std::cos(x.imag()) -
        2 * sin_half_imag * sin_half_imag;
    const T im = std::exp(x.real()) * std::sin(x.imag());
    return c10::complex<T>(re, im);
  }
};

// Launches Expm1Functor over `iter`, dispatching on the iterator's common
// dtype across floating and complex types plus Half and BFloat16.
void expm1_kernel(TensorIteratorBase& iter) {
  const auto dtype = iter.common_dtype();
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      dtype,
      "expm1_xpu",
      [&] { gpu_kernel(iter, Expm1Functor<scalar_t>{}); });
}

} // namespace at::native::xpu
2 changes: 2 additions & 0 deletions src/ATen/native/xpu/sycl/UnaryKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ void bitwise_not_kernel(TensorIteratorBase& iter);

void exp_kernel(TensorIteratorBase& iter);

void expm1_kernel(TensorIteratorBase& iter);

void nan_to_num_kernel(
TensorIteratorBase& iter,
std::optional<double> nan,
Expand Down
100 changes: 100 additions & 0 deletions src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,106 @@ void erfc_kernel(TensorIteratorBase& iter) {
[&]() { gpu_kernel(iter, ErfcFunctor<scalar_t>()); });
}

template <typename scalar_t>
struct ErfinvFunctor {
using opmath_type = at::opmath_type<scalar_t>;

scalar_t operator()(scalar_t in) const {
scalar_t out;
opmath_type z, num, dem;

auto x = static_cast<opmath_type>(in);
if (std::fabs(x) > 1.0f) {
out = static_cast<scalar_t>(NAN);
return out;
}
if (std::fabs(x) == 1.0f) {
out = static_cast<scalar_t>(
(std::copysign(1.0, static_cast<double>(x))) *
(std::numeric_limits<double>::infinity()));
return out;
}
if (std::fabs(x) <= 0.7f) {
z = x * x;
num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]);
dem =
((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z +
static_cast<opmath_type>(1.0));
out = x * num / dem;
} else {
z = static_cast<opmath_type>(
std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0)));
num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0];
dem = (d_[1] * z + d_[0]) * z + static_cast<opmath_type>(1.0);
out = static_cast<scalar_t>(
static_cast<opmath_type>(std::copysign(1.0, static_cast<double>(x))) *
num / dem);
}
out = out -
static_cast<scalar_t>(
(std::erf(static_cast<double>(out)) - x) /
((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
out = out -
static_cast<scalar_t>(
(std::erf(static_cast<double>(out)) - x) /
((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
return out;
}

static constexpr double PI_f64_ = 3.14159265358979323846;
static constexpr std::array<opmath_type, 4> a_ = {
0.886226899,
-1.645349621,
0.914624893,
-0.140543331};
static constexpr std::array<opmath_type, 4> b_ = {
-2.118377725,
1.442710462,
-0.329097515,
0.012229801};
static constexpr std::array<opmath_type, 4> c_ = {
-1.970840454,
-1.624906493,
3.429567803,
1.641345311};
static constexpr std::array<opmath_type, 2> d_ = {3.543889200, 1.637067800};
};

// Launches ErfinvFunctor over `iter`, dispatching on the iterator's common
// dtype across floating types plus Half and BFloat16.
void erfinv_kernel(TensorIteratorBase& iter) {
  const auto dtype = iter.common_dtype();
  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::Half,
      ScalarType::BFloat16,
      dtype,
      "erfinv_xpu",
      [&] { gpu_kernel(iter, ErfinvFunctor<scalar_t>{}); });
}

// Element-wise 2^a for real scalar types, computed with std::exp2 and
// converted back to scalar_t on return.
template <typename scalar_t>
struct Exp2Functor {
  scalar_t operator()(scalar_t a) const {
    const auto power_of_two = std::exp2(a);
    return power_of_two;
  }
};

// Complex specialization of exp2.
template <typename T>
struct Exp2Functor<c10::complex<T>> {
  c10::complex<T> operator()(c10::complex<T> x) const {
    // There is no std::exp2 overload for complex, so instead
    // use the identity 2^x = e^(ln(2) * x)
    constexpr auto kLn2 = static_cast<T>(0.693147180559945309417232121458176);
    const auto scaled = kLn2 * x;
    return std::exp(scaled);
  }
};

// Launches Exp2Functor over `iter`, dispatching on the iterator's common
// dtype across floating and complex types plus Half and BFloat16.
void exp2_kernel(TensorIteratorBase& iter) {
  const auto dtype = iter.common_dtype();
  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      dtype,
      "exp2_xpu",
      [&] { gpu_kernel(iter, Exp2Functor<scalar_t>{}); });
}

template <typename scalar_t>
struct Logit0Functor {
using T_ACC = acc_type_device<scalar_t, c10::DeviceType::XPU>;
Expand Down
4 changes: 4 additions & 0 deletions src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ void erf_kernel(TensorIteratorBase& iter);

void erfc_kernel(TensorIteratorBase& iter);

void erfinv_kernel(TensorIteratorBase& iter);

void exp2_kernel(TensorIteratorBase& iter);

void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar);

} // namespace at::native::xpu
10 changes: 6 additions & 4 deletions test/xpu/extended/run_test_with_skip.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# a. Different kernel implementations.
# b. Different std functions. (std::log, std::tanh, std::exp)
# 5. The result of division between two same float values is not 1.
# 6. std functions get different results when input is nan or inf between GCC and SYCL.
"test_compare_cpu_cumsum_xpu_bfloat16",
"test_compare_cpu_cumsum_xpu_float16",
"test_compare_cpu_log_xpu_complex64",
Expand All @@ -34,17 +35,18 @@
"test_compare_cpu_cross_xpu_float16",
"test_compare_cpu_floor_divide_xpu_bfloat16",
"test_compare_cpu_floor_divide_xpu_float16",

# got inconsistent values between CPU / XPU
# AssertionError: Tensor-likes are not close!
# compute results contain nan / inf
"test_compare_cpu_exp_xpu_bfloat16",
"test_compare_cpu_exp_xpu_complex128",
"test_compare_cpu_exp_xpu_complex64",
"test_compare_cpu_acosh_xpu_complex64",
"test_compare_cpu_asin_xpu_complex128",
"test_compare_cpu_asin_xpu_complex64",
"test_compare_cpu_asinh_xpu_complex128",
"test_compare_cpu_asinh_xpu_complex64",
"test_compare_cpu_atan_xpu_complex128",
"test_compare_cpu_atan_xpu_complex64",
"test_compare_cpu_exp2_xpu_complex128",
"test_compare_cpu_exp2_xpu_complex64",

# skip random failure due to accuracy
# AssertionError: Tensor-likes are not close!
Expand Down
Loading

0 comments on commit 2258cb4

Please sign in to comment.