From 2258cb4a71460fe36da41cc4faf588cb3e56c72c Mon Sep 17 00:00:00 2001 From: hjhee Date: Sat, 20 Jul 2024 21:08:28 +0800 Subject: [PATCH] Add aten::erfinv, aten::exp2, aten::expm1, aten::exponential_ (#527) - erfinv - erfinv_ - erfinv.out - exp2 - exp2_ - exp2.out - expm1 - expm1_ - expm1.out - exponential_ --------- Co-authored-by: Feng Yuan --- src/ATen/native/xpu/Distributions.cpp | 18 ++++ src/ATen/native/xpu/UnaryOps.cpp | 66 ++++++++++++ src/ATen/native/xpu/XPUFallback.template | 4 - .../sycl/DistributionExponentialKernel.cpp | 21 ++++ .../native/xpu/sycl/DistributionKernels.h | 5 + .../native/xpu/sycl/DistributionTemplates.h | 45 ++++++++ src/ATen/native/xpu/sycl/UnaryKernels.cpp | 26 +++++ src/ATen/native/xpu/sycl/UnaryKernels.h | 2 + .../xpu/sycl/UnarySpecialOpsKernels.cpp | 100 ++++++++++++++++++ .../native/xpu/sycl/UnarySpecialOpsKernels.h | 4 + test/xpu/extended/run_test_with_skip.py | 10 +- test/xpu/run_test_with_skip.py | 26 +++-- test/xpu/xpu_test_utils.py | 5 + yaml/xpu_functions.yaml | 10 ++ 14 files changed, 328 insertions(+), 14 deletions(-) create mode 100644 src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp diff --git a/src/ATen/native/xpu/Distributions.cpp b/src/ATen/native/xpu/Distributions.cpp index a2e89743a..51ff727cc 100644 --- a/src/ATen/native/xpu/Distributions.cpp +++ b/src/ATen/native/xpu/Distributions.cpp @@ -191,6 +191,24 @@ Tensor& XPUNativeFunctions::random_( return random_(self, 0, to, std::move(generator)); } +template +struct ExponentialStub { + void operator()( + TensorIteratorBase& iter, + double lambda, + c10::optional gen) { + native::xpu::exponential_kernel(iter, lambda, gen); + } +}; + +Tensor& XPUNativeFunctions::exponential_( + Tensor& self, + double lambda, + std::optional generator) { + return native::templates::exponential_impl_( + self, lambda, std::move(generator)); +} + /* The largest consecutive integer representable in float32 (2^24) */ constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24); diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp index 1222bfc4f..1f9c8e3c0 100644 --- a/src/ATen/native/xpu/UnaryOps.cpp +++ b/src/ATen/native/xpu/UnaryOps.cpp @@ -613,6 +613,72 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) { return out; } +Tensor XPUNativeFunctions::erfinv(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::erfinv_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::erfinv_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::erfinv_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::erfinv_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::exp2(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::exp2_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::exp2_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::exp2_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::exp2_kernel(iter); + return out; +} + +Tensor XPUNativeFunctions::expm1(const Tensor& self) { + Tensor out; + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::expm1_kernel(iter); + return iter.output(); +} + +Tensor& XPUNativeFunctions::expm1_(Tensor& self) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(self, self); + native::xpu::expm1_kernel(iter); + return self; +} + +Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) { + TensorIterator iter; + iter.build_borrowing_unary_float_op(out, self); + native::xpu::expm1_kernel(iter); + return out; +} + Tensor XPUNativeFunctions::frac(const Tensor& self) { Tensor out; TensorIterator iter; diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template index 3341a8e7c..75274d4cf 100644 --- a/src/ATen/native/xpu/XPUFallback.template +++ b/src/ATen/native/xpu/XPUFallback.template @@ -179,10 +179,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) { "_efficient_attention_forward", "_embedding_bag_dense_backward", "_embedding_bag_per_sample_weights_backward", - "erfinv.out", - "exp2.out", - "expm1.out", - "exponential_", "_fft_c2c", "_fft_c2r", "_fft_r2c", diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp new file mode 100644 index 000000000..4c1b83689 --- /dev/null +++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp @@ -0,0 +1,21 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::xpu { + +void exponential_kernel( + TensorIteratorBase& iter, + double lambda, + c10::optional gen) { + auto generator = get_generator_or_default( + gen, at::xpu::detail::getDefaultXPUGenerator()); + at::native::templates::xpu::exponential_kernel(iter, lambda, generator); +} + +} // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h index ce1787f51..e5700f26e 100644 --- a/src/ATen/native/xpu/sycl/DistributionKernels.h +++ b/src/ATen/native/xpu/sycl/DistributionKernels.h @@ -38,4 +38,9 @@ void bernoulli_scalar_kernel( double p, c10::optional gen); +void exponential_kernel( + TensorIteratorBase& iter, + double lambda, + c10::optional gen); + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h index e4345bfad..851263e56 100644 --- a/src/ATen/native/xpu/sycl/DistributionTemplates.h +++ b/src/ATen/native/xpu/sycl/DistributionTemplates.h @@ -591,6 +591,51 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { }); } +// ====================== Exponential ====================== + +template +struct ExponentialFunctor { + auto operator()(accscalar_t val) const { + // BEFORE TOUCHING THIS CODE READ: + // https://github.com/pytorch/pytorch/issues/16706 + // rand_uniform has (0,1] bounds. log(1) is 0 and exponential + // excludes 0. we need log to be not 0, and not underflow when + // converted to half + accscalar_t log; + if (val >= static_cast(1.f) - + std::numeric_limits::epsilon() / 2.f) { + log = -std::numeric_limits::epsilon() / 2.f; + } else { + log = std::log(val); + } + return static_cast(-1.f) / lambd_ * log; + } + ExponentialFunctor(accscalar_t lambd) : lambd_(lambd) {} + + private: + accscalar_t lambd_; +}; + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG gen) { + TORCH_CHECK( + isFloatingType(iter.dtype()), + "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", + iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "exponential__xpu_", + [&] { + using accscalar_t = at::acc_type; + auto lambd = static_cast(lambda); + ExponentialFunctor exponential_func(lambd); + uniform_and_transform( + iter, gen, exponential_func); + }); +} + } // namespace xpu } // namespace templates } // namespace native diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp index e13572dcb..2ba855d34 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp @@ -224,4 +224,30 @@ void nan_to_num_kernel( } } +template +struct Expm1Functor { + scalar_t operator()(scalar_t a) const { + return std::expm1(a); + } +}; + +template +struct Expm1Functor> { + c10::complex operator()(c10::complex x) const { + auto a = std::sin(.5 * x.imag()); + auto re = std::expm1(x.real()) * std::cos(x.imag()) - 2 * a * a; + auto im = std::exp(x.real()) * std::sin(x.imag()); + return c10::complex(re, im); + } +}; + +void expm1_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.common_dtype(), + "expm1_xpu", + [&]() { gpu_kernel(iter, Expm1Functor()); }); +} + } // namespace at::native::xpu diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h index dab9b2808..cc394de6b 100644 --- a/src/ATen/native/xpu/sycl/UnaryKernels.h +++ b/src/ATen/native/xpu/sycl/UnaryKernels.h @@ -12,6 +12,8 @@ void bitwise_not_kernel(TensorIteratorBase& iter); void exp_kernel(TensorIteratorBase& iter); +void expm1_kernel(TensorIteratorBase& iter); + void nan_to_num_kernel( TensorIteratorBase& iter, std::optional nan, diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp index 586b0f0b1..b7d0b8974 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp @@ -77,6 +77,106 @@ void erfc_kernel(TensorIteratorBase& iter) { [&]() { gpu_kernel(iter, ErfcFunctor()); }); } +template +struct ErfinvFunctor { + using opmath_type = at::opmath_type; + + scalar_t operator()(scalar_t in) const { + scalar_t out; + opmath_type z, num, dem; + + auto x = static_cast(in); + if (std::fabs(x) > 1.0f) { + out = static_cast(NAN); + return out; + } + if (std::fabs(x) == 1.0f) { + out = static_cast( + (std::copysign(1.0, static_cast(x))) * + (std::numeric_limits::infinity())); + return out; + } + if (std::fabs(x) <= 0.7f) { + z = x * x; + num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]); + dem = + ((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z + + static_cast(1.0)); + out = x * num / dem; + } else { + z = static_cast( + std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0))); + num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0]; + dem = (d_[1] * z + d_[0]) * z + static_cast(1.0); + out = static_cast( + static_cast(std::copysign(1.0, static_cast(x))) * + num / dem); + } + out = out - + static_cast( + (std::erf(static_cast(out)) - x) / + ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); + out = out - + static_cast( + (std::erf(static_cast(out)) - x) / + ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x))); + return out; + } + + static constexpr double PI_f64_ = 3.14159265358979323846; + static constexpr std::array a_ = { + 0.886226899, + -1.645349621, + 0.914624893, + -0.140543331}; + static constexpr std::array b_ = { + -2.118377725, + 1.442710462, + -0.329097515, + 0.012229801}; + static constexpr std::array c_ = { + -1.970840454, + -1.624906493, + 3.429567803, + 1.641345311}; + static constexpr std::array d_ = {3.543889200, 1.637067800}; +}; + +void erfinv_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + iter.common_dtype(), + "erfinv_xpu", + [&]() { gpu_kernel(iter, ErfinvFunctor()); }); +} + +template +struct Exp2Functor { + scalar_t operator()(scalar_t a) const { + return std::exp2(a); + } +}; + +template +struct Exp2Functor> { + c10::complex operator()(c10::complex x) const { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return std::exp(ln_2 * x); + } +}; + +void exp2_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.common_dtype(), + "exp2_xpu", + [&]() { gpu_kernel(iter, Exp2Functor()); }); +} + template struct Logit0Functor { using T_ACC = acc_type_device; diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h index 447a1d7af..6bed9c6de 100644 --- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h +++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h @@ -10,6 +10,10 @@ void erf_kernel(TensorIteratorBase& iter); void erfc_kernel(TensorIteratorBase& iter); +void erfinv_kernel(TensorIteratorBase& iter); + +void exp2_kernel(TensorIteratorBase& iter); + void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar); } // namespace at::native::xpu diff --git a/test/xpu/extended/run_test_with_skip.py b/test/xpu/extended/run_test_with_skip.py index 7318e25fe..d50f14d53 100644 --- a/test/xpu/extended/run_test_with_skip.py +++ b/test/xpu/extended/run_test_with_skip.py @@ -10,6 +10,7 @@ # a. Different kernel implementations. # b. Different std functions. (std::log, std::tanh, std::exp) # 5. The result of division between two same float values is not 1. + # 6. std functions get different results when input is nan or inf between GCC and SYCL. "test_compare_cpu_cumsum_xpu_bfloat16", "test_compare_cpu_cumsum_xpu_float16", "test_compare_cpu_log_xpu_complex64", @@ -34,10 +35,9 @@ "test_compare_cpu_cross_xpu_float16", "test_compare_cpu_floor_divide_xpu_bfloat16", "test_compare_cpu_floor_divide_xpu_float16", - - # got inconsistent values between CPU / XPU - # AssertionError: Tensor-likes are not close! - # compute results contain nan / inf + "test_compare_cpu_exp_xpu_bfloat16", + "test_compare_cpu_exp_xpu_complex128", + "test_compare_cpu_exp_xpu_complex64", "test_compare_cpu_acosh_xpu_complex64", "test_compare_cpu_asin_xpu_complex128", "test_compare_cpu_asin_xpu_complex64", @@ -45,6 +45,8 @@ "test_compare_cpu_asinh_xpu_complex64", "test_compare_cpu_atan_xpu_complex128", "test_compare_cpu_atan_xpu_complex64", + "test_compare_cpu_exp2_xpu_complex128", + "test_compare_cpu_exp2_xpu_complex64", # skip random failure due to accuracy # AssertionError: Tensor-likes are not close! diff --git a/test/xpu/run_test_with_skip.py b/test/xpu/run_test_with_skip.py index ca4664bad..7d71d50c1 100644 --- a/test/xpu/run_test_with_skip.py +++ b/test/xpu/run_test_with_skip.py @@ -1464,12 +1464,8 @@ def launch_test(test_case, skip_list=None, exe_list=None): "_jiterator_", # CPU Fallback fails: Tensor-likes are not close! "test_reference_numerics_extremal__refs_acos_xpu_complex128", - "test_reference_numerics_extremal__refs_exp2_xpu_complex128", - "test_reference_numerics_extremal__refs_exp2_xpu_complex64", "test_reference_numerics_extremal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_extremal_acos_xpu_complex128", - "test_reference_numerics_extremal_exp2_xpu_complex128", - "test_reference_numerics_extremal_exp2_xpu_complex64", "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_normal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", @@ -1531,6 +1527,26 @@ def launch_test(test_case, skip_list=None, exe_list=None): # Absolute difference: 3.063072442997111e-08 (up to 0.0 allowed) # Relative difference: 6.156719153309558e-06 (up to 1e-06 allowed) "test_log1p_complex_xpu_complex64", + + # CPU MKL::erfinv vs XPU impl. At most 6.e-06 + # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) + "test_reference_numerics_large__refs_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) + "test_reference_numerics_large_erfinv_xpu_float64", + # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) + # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) + "test_reference_numerics_normal__refs_erfinv_xpu_float64", + # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) + # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) + "test_reference_numerics_normal_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) + "test_reference_numerics_small__refs_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) + "test_reference_numerics_small_erfinv_xpu_float64", ) res += launch_test("test_unary_ufuncs_xpu.py", skip_list) @@ -2745,8 +2761,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_corrcoef_xpu_complex64", ### Error #10 in TestTorchDeviceTypeXPU , totally 1 , AssertionError: True is not false "test_discontiguous_out_cumsum_xpu", - ### Error #11 in TestTorchDeviceTypeXPU , totally 1 , AssertionError: tensor(False, device='xpu:0') is not true - "test_exponential_no_zero_xpu_float16", ### Error #12 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'amp' "test_grad_scaler_pass_itself_xpu", "test_pickle_gradscaler_xpu", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index b802270f8..c2f0f15f1 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -46,6 +46,7 @@ "abs", "erf", "erfc", + "erfinv", "bernoulli", "bitwise_and", "bitwise_not", @@ -62,6 +63,10 @@ "cumsum", "equal", "eq", + "exp", + "exp2", + "expm1", + "exponential", "fill", "fmod", "gcd", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 15d371e52..b2990fb8a 100644 --- a/yaml/xpu_functions.yaml +++ b/yaml/xpu_functions.yaml @@ -241,6 +241,13 @@ supported: - exp - exp.out - exp_ + - exp2 + - exp2_ + - exp2.out + - expm1 + - expm1_ + - expm1.out + - exponential_ - empty.memory_format - empty_strided - eye.out @@ -551,6 +558,9 @@ supported: - erf - erf_ - erf.out + - erfinv + - erfinv_ + - erfinv.out - linalg_vector_norm - linalg_vector_norm.out - grid_sampler_2d