diff --git a/src/ATen/native/xpu/Distributions.cpp b/src/ATen/native/xpu/Distributions.cpp
index a2e89743a..51ff727cc 100644
--- a/src/ATen/native/xpu/Distributions.cpp
+++ b/src/ATen/native/xpu/Distributions.cpp
@@ -191,6 +191,24 @@ Tensor& XPUNativeFunctions::random_(
   return random_(self, 0, to, std::move(generator));
 }
 
+template <typename RNG>
+struct ExponentialStub {
+  void operator()(
+      TensorIteratorBase& iter,
+      double lambda,
+      c10::optional<Generator> gen) {
+    native::xpu::exponential_kernel(iter, lambda, gen);
+  }
+};
+
+Tensor& XPUNativeFunctions::exponential_(
+    Tensor& self,
+    double lambda,
+    std::optional<Generator> generator) {
+  return native::templates::exponential_impl_<ExponentialStub, Generator>(
+      self, lambda, std::move(generator));
+}
+
 /* The largest consecutive integer representable in float32 (2^24) */
 constexpr int64_t FLOAT32_MAX_CONSECUTIVE_INT = 1 << (24);
diff --git a/src/ATen/native/xpu/UnaryOps.cpp b/src/ATen/native/xpu/UnaryOps.cpp
index 1222bfc4f..1f9c8e3c0 100644
--- a/src/ATen/native/xpu/UnaryOps.cpp
+++ b/src/ATen/native/xpu/UnaryOps.cpp
@@ -613,6 +613,72 @@ Tensor& XPUNativeFunctions::erfc_out(const Tensor& self, Tensor& out) {
   return out;
 }
 
+Tensor XPUNativeFunctions::erfinv(const Tensor& self) {
+  Tensor out;
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::erfinv_kernel(iter);
+  return iter.output();
+}
+
+Tensor& XPUNativeFunctions::erfinv_(Tensor& self) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(self, self);
+  native::xpu::erfinv_kernel(iter);
+  return self;
+}
+
+Tensor& XPUNativeFunctions::erfinv_out(const Tensor& self, Tensor& out) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::erfinv_kernel(iter);
+  return out;
+}
+
+Tensor XPUNativeFunctions::exp2(const Tensor& self) {
+  Tensor out;
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::exp2_kernel(iter);
+  return iter.output();
+}
+
+Tensor& XPUNativeFunctions::exp2_(Tensor& self) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(self, self);
+  native::xpu::exp2_kernel(iter);
+  return self;
+}
+
+Tensor& XPUNativeFunctions::exp2_out(const Tensor& self, Tensor& out) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::exp2_kernel(iter);
+  return out;
+}
+
+Tensor XPUNativeFunctions::expm1(const Tensor& self) {
+  Tensor out;
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::expm1_kernel(iter);
+  return iter.output();
+}
+
+Tensor& XPUNativeFunctions::expm1_(Tensor& self) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(self, self);
+  native::xpu::expm1_kernel(iter);
+  return self;
+}
+
+Tensor& XPUNativeFunctions::expm1_out(const Tensor& self, Tensor& out) {
+  TensorIterator iter;
+  iter.build_borrowing_unary_float_op(out, self);
+  native::xpu::expm1_kernel(iter);
+  return out;
+}
+
 Tensor XPUNativeFunctions::frac(const Tensor& self) {
   Tensor out;
   TensorIterator iter;
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
index 3341a8e7c..75274d4cf 100644
--- a/src/ATen/native/xpu/XPUFallback.template
+++ b/src/ATen/native/xpu/XPUFallback.template
@@ -179,10 +179,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "_efficient_attention_forward",
       "_embedding_bag_dense_backward",
       "_embedding_bag_per_sample_weights_backward",
-      "erfinv.out",
-      "exp2.out",
-      "expm1.out",
-      "exponential_",
       "_fft_c2c",
       "_fft_c2r",
       "_fft_r2c",
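With the four fallback entries removed above, erfinv, exp2, expm1, and exponential_ dispatch to the native XPU kernels added below instead of round-tripping through the CPU. A quick smoke test through the public ATen C++ API (a minimal sketch, not part of the patch; it assumes a torch-xpu-ops build with an XPU device visible):

    #include <ATen/ATen.h>

    int main() {
      // Each op now runs directly on the XPU device.
      auto x = at::rand({4}, at::device(at::kXPU).dtype(at::kFloat));
      auto y = at::erfinv(x * 2 - 1);  // erfinv is defined on [-1, 1]
      auto z = at::exp2(x);            // 2^x
      auto w = at::expm1(x);           // e^x - 1, accurate near zero
      x.exponential_(/*lambd=*/1.0);   // in-place exponential sampling
      return 0;
    }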
diff --git a/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp
new file mode 100644
index 000000000..4c1b83689
--- /dev/null
+++ b/src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp
@@ -0,0 +1,21 @@
+#include <ATen/ATen.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/xpu/XPUGeneratorImpl.h>
+#include <ATen/native/xpu/sycl/DistributionTemplates.h>
+#include <ATen/native/xpu/sycl/MemoryAccess.h>
+#include <ATen/native/xpu/sycl/OffsetCalculator.h>
+#include <ATen/native/xpu/sycl/Philox4x32.h>
+#include <comm/SYCLContext.h>
+
+namespace at::native::xpu {
+
+void exponential_kernel(
+    TensorIteratorBase& iter,
+    double lambda,
+    c10::optional<Generator> gen) {
+  auto generator = get_generator_or_default<at::XPUGeneratorImpl>(
+      gen, at::xpu::detail::getDefaultXPUGenerator());
+  at::native::templates::xpu::exponential_kernel(iter, lambda, generator);
+}
+
+} // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/DistributionKernels.h b/src/ATen/native/xpu/sycl/DistributionKernels.h
index ce1787f51..e5700f26e 100644
--- a/src/ATen/native/xpu/sycl/DistributionKernels.h
+++ b/src/ATen/native/xpu/sycl/DistributionKernels.h
@@ -38,4 +38,9 @@ void bernoulli_scalar_kernel(
     double p,
     c10::optional<Generator> gen);
 
+void exponential_kernel(
+    TensorIteratorBase& iter,
+    double lambda,
+    c10::optional<Generator> gen);
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/DistributionTemplates.h b/src/ATen/native/xpu/sycl/DistributionTemplates.h
index e4345bfad..851263e56 100644
--- a/src/ATen/native/xpu/sycl/DistributionTemplates.h
+++ b/src/ATen/native/xpu/sycl/DistributionTemplates.h
@@ -591,6 +591,51 @@ void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) {
   });
 }
 
+// ====================== Exponential ======================
+
+template <typename scalar_t, typename accscalar_t>
+struct ExponentialFunctor {
+  auto operator()(accscalar_t val) const {
+    // BEFORE TOUCHING THIS CODE READ:
+    // https://github.com/pytorch/pytorch/issues/16706
+    // rand_uniform has (0,1] bounds: log(1) is 0, and the exponential
+    // distribution excludes 0, so the log must be nonzero and must not
+    // underflow when converted to half.
+    accscalar_t log;
+    if (val >= static_cast<accscalar_t>(1.f) -
+            std::numeric_limits<accscalar_t>::epsilon() / 2.f) {
+      log = -std::numeric_limits<accscalar_t>::epsilon() / 2.f;
+    } else {
+      log = std::log(val);
+    }
+    return static_cast<accscalar_t>(-1.f) / lambd_ * log;
+  }
+  ExponentialFunctor(accscalar_t lambd) : lambd_(lambd) {}
+
+ private:
+  accscalar_t lambd_;
+};
+
+template <typename RNG>
+void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG gen) {
+  TORCH_CHECK(
+      isFloatingType(iter.dtype()),
+      "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ",
+      iter.dtype());
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "exponential__xpu_",
+      [&] {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        auto lambd = static_cast<accscalar_t>(lambda);
+        ExponentialFunctor<scalar_t, accscalar_t> exponential_func(lambd);
+        uniform_and_transform<scalar_t, accscalar_t, rand4_engine_calls>(
+            iter, gen, exponential_func);
+      });
+}
+
 } // namespace xpu
 } // namespace templates
 } // namespace native
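The clamp inside ExponentialFunctor is the subtle part: the uniform draw u lies in (0, 1], and the inverse CDF x = -log(u)/lambda would map u == 1 to exactly 0, which the exponential distribution excludes (see pytorch/pytorch#16706). A standalone host-side sketch of the same transform (hypothetical helper in plain float math, not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <limits>

    // For u uniform on (0,1], -log(u)/lambda is Exponential(lambda).
    // u near 1 is clamped so log(u) stays nonzero and the sample cannot
    // be 0 (or underflow to 0 after a cast to half precision).
    float exponential_from_uniform(float u, float lambda) {
      constexpr float eps = std::numeric_limits<float>::epsilon();
      float log_u = (u >= 1.0f - eps / 2.0f) ? -eps / 2.0f : std::log(u);
      return -1.0f / lambda * log_u;
    }

    int main() {
      std::printf("%g\n", exponential_from_uniform(1.0f, 2.0f)); // tiny, but > 0
      std::printf("%g\n", exponential_from_uniform(0.5f, 2.0f)); // ~0.34657
      return 0;
    }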
diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.cpp b/src/ATen/native/xpu/sycl/UnaryKernels.cpp
index e13572dcb..2ba855d34 100644
--- a/src/ATen/native/xpu/sycl/UnaryKernels.cpp
+++ b/src/ATen/native/xpu/sycl/UnaryKernels.cpp
@@ -224,4 +224,30 @@ void nan_to_num_kernel(
   }
 }
 
+template <typename scalar_t>
+struct Expm1Functor {
+  scalar_t operator()(scalar_t a) const {
+    return std::expm1(a);
+  }
+};
+
+template <typename T>
+struct Expm1Functor<c10::complex<T>> {
+  c10::complex<T> operator()(c10::complex<T> x) const {
+    auto a = std::sin(.5 * x.imag());
+    auto re = std::expm1(x.real()) * std::cos(x.imag()) - 2 * a * a;
+    auto im = std::exp(x.real()) * std::sin(x.imag());
+    return c10::complex<T>(re, im);
+  }
+};
+
+void expm1_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.common_dtype(),
+      "expm1_xpu",
+      [&]() { gpu_kernel(iter, Expm1Functor<scalar_t>()); });
+}
+
 } // namespace at::native::xpu
diff --git a/src/ATen/native/xpu/sycl/UnaryKernels.h b/src/ATen/native/xpu/sycl/UnaryKernels.h
index dab9b2808..cc394de6b 100644
--- a/src/ATen/native/xpu/sycl/UnaryKernels.h
+++ b/src/ATen/native/xpu/sycl/UnaryKernels.h
@@ -12,6 +12,8 @@ void bitwise_not_kernel(TensorIteratorBase& iter);
 
 void exp_kernel(TensorIteratorBase& iter);
 
+void expm1_kernel(TensorIteratorBase& iter);
+
 void nan_to_num_kernel(
     TensorIteratorBase& iter,
     std::optional<double> nan,
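For complex inputs, the Expm1Functor specialization above avoids the direct formula e^x*cos(y) - 1, which cancels catastrophically when z is near 0; it uses Re(e^z - 1) = expm1(x)*cos(y) - 2*sin^2(y/2), where -2*sin^2(y/2) is the cancellation-free rewrite of cos(y) - 1. A host-side comparison of the naive and stable forms (standalone sketch in double precision, not part of the patch):

    #include <cmath>
    #include <complex>
    #include <cstdio>

    int main() {
      const double x = 1e-9, y = 1e-9;  // z close to 0, where cancellation bites
      // Naive: exp(z) - 1 loses most significant digits of the real part.
      const std::complex<double> naive = std::exp(std::complex<double>(x, y)) - 1.0;
      // Stable form used by the kernel.
      const double s = std::sin(0.5 * y);
      const double re = std::expm1(x) * std::cos(y) - 2.0 * s * s;
      const double im = std::exp(x) * std::sin(y);
      std::printf("naive : %.17g %+.17gi\n", naive.real(), naive.imag());
      std::printf("stable: %.17g %+.17gi\n", re, im);
      return 0;
    }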
diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
index 586b0f0b1..b7d0b8974 100644
--- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
+++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.cpp
@@ -77,6 +77,106 @@ void erfc_kernel(TensorIteratorBase& iter) {
       [&]() { gpu_kernel(iter, ErfcFunctor<scalar_t>()); });
 }
 
+template <typename scalar_t>
+struct ErfinvFunctor {
+  using opmath_type = at::opmath_type<scalar_t>;
+
+  scalar_t operator()(scalar_t in) const {
+    scalar_t out;
+    opmath_type z, num, dem;
+
+    auto x = static_cast<opmath_type>(in);
+    if (std::fabs(x) > 1.0f) {
+      out = static_cast<scalar_t>(NAN);
+      return out;
+    }
+    if (std::fabs(x) == 1.0f) {
+      out = static_cast<scalar_t>(
+          (std::copysign(1.0, static_cast<double>(x))) *
+          (std::numeric_limits<double>::infinity()));
+      return out;
+    }
+    if (std::fabs(x) <= 0.7f) {
+      z = x * x;
+      num = (((a_[3] * z + a_[2]) * z + a_[1]) * z + a_[0]);
+      dem =
+          ((((b_[3] * z + b_[2]) * z + b_[1]) * z + b_[0]) * z +
+           static_cast<opmath_type>(1.0));
+      out = x * num / dem;
+    } else {
+      z = static_cast<opmath_type>(
+          std::sqrt(-std::log((1.0 - std::fabs(x)) / 2.0)));
+      num = ((c_[3] * z + c_[2]) * z + c_[1]) * z + c_[0];
+      dem = (d_[1] * z + d_[0]) * z + static_cast<opmath_type>(1.0);
+      out = static_cast<scalar_t>(
+          static_cast<opmath_type>(std::copysign(1.0, static_cast<double>(x))) *
+          num / dem);
+    }
+    out = out -
+        static_cast<scalar_t>(
+            (std::erf(static_cast<opmath_type>(out)) - x) /
+            ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
+    out = out -
+        static_cast<scalar_t>(
+            (std::erf(static_cast<opmath_type>(out)) - x) /
+            ((2.0 / std::sqrt(PI_f64_)) * std::exp(-x * x)));
+    return out;
+  }
+
+  static constexpr double PI_f64_ = 3.14159265358979323846;
+  static constexpr std::array<opmath_type, 4> a_ = {
+      0.886226899,
+      -1.645349621,
+      0.914624893,
+      -0.140543331};
+  static constexpr std::array<opmath_type, 4> b_ = {
+      -2.118377725,
+      1.442710462,
+      -0.329097515,
+      0.012229801};
+  static constexpr std::array<opmath_type, 4> c_ = {
+      -1.970840454,
+      -1.624906493,
+      3.429567803,
+      1.641345311};
+  static constexpr std::array<opmath_type, 2> d_ = {3.543889200, 1.637067800};
+};
+
+void erfinv_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      ScalarType::Half,
+      ScalarType::BFloat16,
+      iter.common_dtype(),
+      "erfinv_xpu",
+      [&]() { gpu_kernel(iter, ErfinvFunctor<scalar_t>()); });
+}
+
+template <typename scalar_t>
+struct Exp2Functor {
+  scalar_t operator()(scalar_t a) const {
+    return std::exp2(a);
+  }
+};
+
+template <typename T>
+struct Exp2Functor<c10::complex<T>> {
+  c10::complex<T> operator()(c10::complex<T> x) const {
+    // There is no std::exp2 overload for complex, so instead
+    // use the identity 2^x = e^(ln(2) * x)
+    const auto ln_2 = static_cast<T>(0.693147180559945309417232121458176);
+    return std::exp(ln_2 * x);
+  }
+};
+
+void exp2_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.common_dtype(),
+      "exp2_xpu",
+      [&]() { gpu_kernel(iter, Exp2Functor<scalar_t>()); });
+}
+
 template <typename scalar_t>
 struct Logit0Functor {
   using T_ACC = acc_type_device<scalar_t, kXPU>;
diff --git a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h
index 447a1d7af..6bed9c6de 100644
--- a/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h
+++ b/src/ATen/native/xpu/sycl/UnarySpecialOpsKernels.h
@@ -10,6 +10,10 @@ void erf_kernel(TensorIteratorBase& iter);
 
 void erfc_kernel(TensorIteratorBase& iter);
 
+void erfinv_kernel(TensorIteratorBase& iter);
+
+void exp2_kernel(TensorIteratorBase& iter);
+
 void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar);
 
 } // namespace at::native::xpu
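Since std::exp2 has no complex overload, the Exp2Functor specialization relies on the identity 2^z = e^(z ln 2), which also holds for complex z on the principal branch. A host-side check of the identity against std::pow (standalone sketch, not part of the patch):

    #include <cmath>
    #include <complex>
    #include <cstdio>

    int main() {
      const double ln_2 = 0.693147180559945309417232121458176;
      const std::complex<double> z(0.5, 1.25);
      const std::complex<double> via_exp = std::exp(ln_2 * z);  // kernel's identity
      const std::complex<double> via_pow = std::pow(2.0, z);    // reference
      std::printf("exp: %.15g %+.15gi\n", via_exp.real(), via_exp.imag());
      std::printf("pow: %.15g %+.15gi\n", via_pow.real(), via_pow.imag());
      return 0;
    }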
"test_reference_numerics_extremal__refs_acos_xpu_complex128", - "test_reference_numerics_extremal__refs_exp2_xpu_complex128", - "test_reference_numerics_extremal__refs_exp2_xpu_complex64", "test_reference_numerics_extremal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_extremal_acos_xpu_complex128", - "test_reference_numerics_extremal_exp2_xpu_complex128", - "test_reference_numerics_extremal_exp2_xpu_complex64", "test_reference_numerics_extremal_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_normal__refs_nn_functional_tanhshrink_xpu_complex64", "test_reference_numerics_normal_nn_functional_tanhshrink_xpu_complex64", @@ -1531,6 +1527,26 @@ def launch_test(test_case, skip_list=None, exe_list=None): # Absolute difference: 3.063072442997111e-08 (up to 0.0 allowed) # Relative difference: 6.156719153309558e-06 (up to 1e-06 allowed) "test_log1p_complex_xpu_complex64", + + # CPU MKL::erfinv vs XPU impl. At most 6.e-06 + # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) + "test_reference_numerics_large__refs_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (0,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (0,) (up to 1e-07 allowed) + "test_reference_numerics_large_erfinv_xpu_float64", + # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) + # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) + "test_reference_numerics_normal__refs_erfinv_xpu_float64", + # Greatest absolute difference: 4.829411781148707e-06 at index (690, 855) (up to 1e-07 allowed) + # Greatest relative difference: 1.5588752485769885e-06 at index (690, 855) (up to 1e-07 allowed) + "test_reference_numerics_normal_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) + "test_reference_numerics_small__refs_erfinv_xpu_float64", + # Greatest absolute difference: 5.250126961175994e-06 at index (96,) (up to 1e-07 allowed) + # Greatest relative difference: 1.680894105274219e-06 at index (96,) (up to 1e-07 allowed) + "test_reference_numerics_small_erfinv_xpu_float64", ) res += launch_test("test_unary_ufuncs_xpu.py", skip_list) @@ -2745,8 +2761,6 @@ def launch_test(test_case, skip_list=None, exe_list=None): "test_corrcoef_xpu_complex64", ### Error #10 in TestTorchDeviceTypeXPU , totally 1 , AssertionError: True is not false "test_discontiguous_out_cumsum_xpu", - ### Error #11 in TestTorchDeviceTypeXPU , totally 1 , AssertionError: tensor(False, device='xpu:0') is not true - "test_exponential_no_zero_xpu_float16", ### Error #12 in TestTorchDeviceTypeXPU , totally 2 , AttributeError: module 'torch.xpu' has no attribute 'amp' "test_grad_scaler_pass_itself_xpu", "test_pickle_gradscaler_xpu", diff --git a/test/xpu/xpu_test_utils.py b/test/xpu/xpu_test_utils.py index b802270f8..c2f0f15f1 100644 --- a/test/xpu/xpu_test_utils.py +++ b/test/xpu/xpu_test_utils.py @@ -46,6 +46,7 @@ "abs", "erf", "erfc", + "erfinv", "bernoulli", "bitwise_and", "bitwise_not", @@ -62,6 +63,10 @@ "cumsum", "equal", "eq", + "exp", + "exp2", + "expm1", + "exponential", "fill", "fmod", "gcd", diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml index 15d371e52..b2990fb8a 100644 
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 15d371e52..b2990fb8a 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -241,6 +241,13 @@ supported:
   - exp
   - exp.out
   - exp_
+  - exp2
+  - exp2_
+  - exp2.out
+  - expm1
+  - expm1_
+  - expm1.out
+  - exponential_
   - empty.memory_format
   - empty_strided
   - eye.out
@@ -551,6 +558,9 @@ supported:
   - erf
   - erf_
   - erf.out
+  - erfinv
+  - erfinv_
+  - erfinv.out
   - linalg_vector_norm
   - linalg_vector_norm.out
   - grid_sampler_2d