
Commit

Merge branch 'main' into raymond/UT_FP64_Win_ARC
min-jean-cho authored Aug 27, 2024
2 parents efb9c10 + 455deaf commit ccecded
Showing 16 changed files with 63 additions and 68 deletions.
19 changes: 9 additions & 10 deletions src/ATen/native/xpu/Copy.cpp
@@ -300,7 +300,9 @@ Tensor& XPUNativeFunctions::copy_(
const Tensor& src,
bool non_blocking) {
if (self._is_zerotensor()) {
-    TORCH_CHECK(false, "ZeroTensors are immutable. Please materialize the tensor using `.clone()`, if you want a mutable zero tensor.");
+    TORCH_CHECK(
+        false,
+        "ZeroTensors are immutable. Please materialize the tensor using `.clone()`, if you want a mutable zero tensor.");
}
if (src._is_zerotensor()) {
return self.zero_();
@@ -316,15 +318,12 @@ Tensor& XPUNativeFunctions::copy_(
// TODO: Support quantization

// Exit early if self and src are views of the same data
-  const bool is_same_data = (
-      self.is_alias_of(src) &&
-      self.storage_offset() == src.storage_offset() &&
-      self.strides().equals(src.strides()) &&
-      self.sizes().equals(src.sizes()) &&
-      self.scalar_type() == src.scalar_type() &&
-      self.is_conj() == src.is_conj() &&
-      self.is_neg() == src.is_neg()
-  );
+  const bool is_same_data =
+      (self.is_alias_of(src) && self.storage_offset() == src.storage_offset() &&
+       self.strides().equals(src.strides()) &&
+       self.sizes().equals(src.sizes()) &&
+       self.scalar_type() == src.scalar_type() &&
+       self.is_conj() == src.is_conj() && self.is_neg() == src.is_neg());
if (is_same_data) {
return self;
}
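A note on the early-exit predicate above: it fires only when self and src are aliases of the same storage with identical offset, strides, sizes, dtype, and conj/neg bits. A minimal sketch of the two cases, assuming a standard ATen build (the tensors and calls here are illustrative, not part of this commit; dispatching to this XPU implementation would additionally require XPU tensors):

#include <ATen/ATen.h>

void copy_early_exit_demo() {
  at::Tensor t = at::rand({4, 4});
  // Same tensor: alias with identical offset/strides/sizes/dtype,
  // so copy_ returns early and no copy kernel is launched.
  t.copy_(t);

  // Distinct storage: the predicate is false, so a real copy runs.
  at::Tensor u = t.clone();
  t.copy_(u);
}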
3 changes: 2 additions & 1 deletion src/ATen/native/xpu/Resize.cpp
@@ -66,7 +66,8 @@ Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) {
// For test infrastructure
Tensor _copy_from(const Tensor& self, const Tensor& dst, bool non_blocking) {
dst.resize_as_(self);
-  return at::XPUNativeFunctions::copy_(const_cast<Tensor&>(dst), self, non_blocking);
+  return at::XPUNativeFunctions::copy_(
+      const_cast<Tensor&>(dst), self, non_blocking);
}

// Should not register the operator. Desc of
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/Sorting.cpp
@@ -1,12 +1,12 @@
#include <ATen/ATen.h>
#include <ATen/core/Tensor.h>
#include <ATen/core/op_registration/adaption.h>
+#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/Sorting.h>
-#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/xpu/XPUNativeFunctions.h>
-#include <comm/TensorInfo.h>
#include <comm/RegisterUtils.h>
+#include <comm/TensorInfo.h>

namespace at {

1 change: 0 additions & 1 deletion src/ATen/native/xpu/UnaryOps.cpp
@@ -1037,7 +1037,6 @@ Tensor& XPUNativeFunctions::ceil_out(const Tensor& self, Tensor& out) {
return out;
}

-
Tensor XPUNativeFunctions::round(const Tensor& self) {
if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/false)) {
return self.clone();
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ActivationGeluKernel.cpp
@@ -55,7 +55,7 @@ struct GeluErfFunctor {
using opmath_t = at::opmath_type<scalar_t>;
constexpr opmath_t kAlpha = M_SQRT1_2;
return static_cast<opmath_t>(x) * opmath_t(0.5) *
-      (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
+      (opmath_t(1) + std::erf(static_cast<opmath_t>(x) * kAlpha));
}
};

@@ -66,7 +66,7 @@ struct GeluErfBackwardFunctor {
constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5);
constexpr opmath_t kAlpha = M_SQRT1_2;
const opmath_t cdf = opmath_t(0.5) *
-        (opmath_t(1) + ::erf(static_cast<opmath_t>(x) * kAlpha));
+        (opmath_t(1) + std::erf(static_cast<opmath_t>(x) * kAlpha));
const opmath_t pdf = c10::xpu::compat::exp(
opmath_t(-0.5) * static_cast<opmath_t>(x) *
static_cast<opmath_t>(x)) *
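For reference, the two functors in this file compute the erf-based GELU and its derivative; in the notation of the constants above, kAlpha = M_SQRT1_2 = 1/sqrt(2) and kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5 = 1/sqrt(2*pi):

\mathrm{GELU}(x) = x\,\Phi(x) = \tfrac{x}{2}\bigl(1 + \operatorname{erf}(x/\sqrt{2})\bigr),
\qquad
\mathrm{GELU}'(x) = \Phi(x) + x\,\varphi(x),
\quad
\varphi(x) = \tfrac{1}{\sqrt{2\pi}}\,e^{-x^2/2}.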
4 changes: 1 addition & 3 deletions src/ATen/native/xpu/sycl/Atomics.h
@@ -325,9 +325,7 @@ static inline void atomicAdd(
target.fetch_add(val);
}

-static inline void atomicAdd(
-    const sycl_local_ptr<int>& address,
-    int val) {
+static inline void atomicAdd(const sycl_local_ptr<int>& address, int val) {
sycl_atomic_ref_rlx_wg_local_t<int> target(*address);
target.fetch_add(val);
}
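For readers unfamiliar with the alias used here: sycl_atomic_ref_rlx_wg_local_t is presumably a relaxed, work-group-scoped sycl::atomic_ref over the local address space. A sketch of what such an alias could look like — an assumption for illustration, not the repository's actual definition:

#include <sycl/sycl.hpp>

// Assumed shape of the alias; the real definition lives elsewhere in Atomics.h.
template <typename T>
using sycl_atomic_ref_rlx_wg_local_t = sycl::atomic_ref<
    T,
    sycl::memory_order::relaxed,
    sycl::memory_scope::work_group,
    sycl::access::address_space::local_space>;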
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/BinaryRemainderKernel.cpp
@@ -41,7 +41,7 @@ template <typename scalar_t>
struct FmodFloatingFunctor {
scalar_t operator()(scalar_t a, scalar_t b) const
__ubsan_ignore_float_divide_by_zero__ {
-    return ::fmod(a, b);
+    return std::fmod(a, b);
}
};

30 changes: 15 additions & 15 deletions src/ATen/native/xpu/sycl/Math.h
@@ -33,7 +33,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
return std::copysign(static_cast<scalar_t>(INFINITY), -x);
}

-  bool x_is_integer = x == ::trunc(x);
+  bool x_is_integer = x == std::trunc(x);
accscalar_t result = 0;
if (x < 0) {
if (x_is_integer) {
@@ -47,8 +47,8 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
// a periodicity of pi, in practice the computation of pi * x is a source of
// error (when |x| > 1).
double q, r;
-    r = ::modf(static_cast<double>(x), &q);
-    result = static_cast<accscalar_t>(-PI_f64 / ::tan(PI_f64 * r));
+    r = std::modf(static_cast<double>(x), &q);
+    result = static_cast<accscalar_t>(-PI_f64 / std::tan(PI_f64 * r));
x = 1 - x;
}

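The reflection step above uses the standard digamma identity; writing it out makes the modf/tan combination clearer:

\psi(x) = \psi(1 - x) - \pi \cot(\pi x) = \psi(1 - x) - \frac{\pi}{\tan(\pi x)}

Since tan has period \pi, \tan(\pi x) = \tan(\pi r) where r is the fractional part of x returned by std::modf, so tan is evaluated on a small argument — limiting the rounding error in \pi \cdot x that the comment warns about for |x| > 1.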
@@ -72,7 +72,7 @@ static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
}

return static_cast<scalar_t>(
-      ::log(x) - (static_cast<accscalar_t>(0.5) / x) - y + result);
+      std::log(x) - (static_cast<accscalar_t>(0.5) / x) - y + result);
}

template <typename scalar_t>
@@ -84,7 +84,7 @@ static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) {
accscalar_t result = 0;
if (x < 0.5f) {
sign = -1;
-    accscalar_t sin_pi_x = ::sin(PI * x);
+    accscalar_t sin_pi_x = std::sin(PI * x);
result -= (PI * PI) / (sin_pi_x * sin_pi_x);
x = 1 - x;
}
@@ -190,22 +190,22 @@ static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) {
"don't instantiate with low precision type");
// Upcast input for numerical accuracy purposes
// Needed for accurate results if input is bfloat16 or float16
-  scalar_t x = ::abs(_x);
+  scalar_t x = std::abs(_x);

if (x <= scalar_t{8.0}) {
auto coeff_pair = chebyshev_coefficients_i0e_A<scalar_t>();
auto A = std::get<0>(coeff_pair);
auto len = std::get<1>(coeff_pair);
scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0};
-    return (::exp(x) * chbevl(y, A, len));
+    return (std::exp(x) * chbevl(y, A, len));
}

auto coeff_pair = chebyshev_coefficients_i0e_B<scalar_t>();
auto B = std::get<0>(coeff_pair);
auto len = std::get<1>(coeff_pair);
return (
-      ::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) /
-      ::sqrt(x));
+      std::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) /
+      std::sqrt(x));
}

template <typename T>
@@ -319,28 +319,28 @@ C10_HOST_DEVICE inline typename std::

template <typename scalar_t>
static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) {
-  const auto x = ::abs(_x);
+  const auto x = std::abs(_x);
if (x <= scalar_t{8.0}) {
auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
auto A = std::get<0>(coeff_pair);
auto len = std::get<1>(coeff_pair);
scalar_t y = x / scalar_t{2.0} - scalar_t{2.0};
-    const scalar_t out = ::exp(x) * x * chbevl(y, A, len);
+    const scalar_t out = std::exp(x) * x * chbevl(y, A, len);
return (_x < scalar_t{0.0}) ? -out : out;
}

auto coeff_pair = chebyshev_coefficients_i1e_B<scalar_t>();
auto B = std::get<0>(coeff_pair);
auto len = std::get<1>(coeff_pair);
const scalar_t out =
-      (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) /
-      ::sqrt(x);
+      (std::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) /
+      std::sqrt(x);
return (_x < scalar_t{0.0}) ? -out : out;
}

template <typename scalar_t>
static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) {
-  const auto x = ::abs(_x);
+  const auto x = std::abs(_x);
if (x <= scalar_t{8.0}) {
auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
auto A = std::get<0>(coeff_pair);
@@ -354,7 +354,7 @@ static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) {
auto B = std::get<0>(coeff_pair);
auto len = std::get<1>(coeff_pair);
const scalar_t out =
-      chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x);
+      chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / std::sqrt(x);
return (_x < scalar_t{0.0}) ? -out : out;
}

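For reference, calc_i0, calc_i1, and calc_i1e above follow the classic Cephes-style evaluation of the modified Bessel functions, splitting at x = 8 and factoring out the exponential growth:

I_0(x) = e^{x}\,\mathrm{chbevl}\!\left(\tfrac{x}{2} - 2,\ A\right) \quad (0 \le x \le 8),
\qquad
I_0(x) = \frac{e^{x}}{\sqrt{x}}\,\mathrm{chbevl}\!\left(\tfrac{32}{x} - 2,\ B\right) \quad (x > 8),

with I_1 handled analogously (an extra factor of x on the small branch, and the sign restored from _x afterwards, since I_1 is odd). calc_i1e computes the exponentially scaled e^{-x} I_1(x), which is why its branches drop the exp factor entirely.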
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/Pow.h
@@ -47,7 +47,7 @@ powI_(Base_type base, Exp_type exp) {
#else
template <typename Base_type, typename Exp_type>
static inline Base_type pow_(Base_type base, Exp_type exp) {
-  return ::pow(base, exp);
+  return std::pow(base, exp);
}
#endif

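One plausible motivation for replacing ::pow (and ::fmod, ::erf, ::exp, and the other math calls changed throughout this commit) with the std:: forms — stated here as an inference, not taken from the commit itself: <cmath> guarantees float overloads in namespace std, whereas the global-namespace C function takes and returns double, so unqualified calls can silently promote FP32 math to FP64, which matters on devices with limited FP64 support. A minimal illustration:

#include <cmath>
#include <type_traits>

// std::pow has a float overload, so float operands stay in FP32;
// C's pow(double, double) would promote them to double.
static_assert(
    std::is_same_v<decltype(std::pow(2.0f, 3.0f)), float>,
    "std::pow(float, float) returns float");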
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/PowKernels.cpp
@@ -19,7 +19,7 @@ namespace impl {

template <typename Base_type, typename Exp_type>
static inline Base_type pow_(Base_type base, Exp_type exp) {
-  return ::pow(base, exp);
+  return std::pow(base, exp);
}

template <typename T>
5 changes: 2 additions & 3 deletions src/ATen/native/xpu/sycl/Reduce.h
@@ -246,9 +246,8 @@ struct ReduceConfig {
// in side of SG. It is functional WA. We got case failures on some
// platforms supporting SIMD8.
// https://github.com/intel/torch-xpu-ops/issues/698
-    auto max_sg_sz = syclMinSubGroupSize() == 8
-        ? syclMinSubGroupSize()
-        : syclMaxSubGroupSize();
+    auto max_sg_sz = syclMinSubGroupSize() == 8 ? syclMinSubGroupSize()
+                                                : syclMaxSubGroupSize();
const int max_num_items = max_wg_sz / output_vec_size;
int dim0_pow2 = dim0 < max_num_items ? static_cast<int>(last_pow2(dim0))
: max_num_items;
34 changes: 17 additions & 17 deletions src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
@@ -271,7 +271,7 @@ struct DispatchSoftmaxForwardKernelFunctor
++i) {
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
-        sum_value += ::exp(reg_in[i][j] - max_value);
+        sum_value += std::exp(reg_in[i][j] - max_value);
}
}
if (local_size_ > 1) {
@@ -285,7 +285,7 @@ struct DispatchSoftmaxForwardKernelFunctor
[](accscalar_t a, accscalar_t b) { return a + b; });
}
if constexpr (LogSoftMax)
-      sum_value = ::log(sum_value);
+      sum_value = std::log(sum_value);
else if (sum_value != 0)
sum_value = accscalar_t(1) / sum_value;

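The branch above implements the numerically stable, max-subtracted normalization; with m = \max_j x_j and s = \sum_j e^{x_j - m}:

\mathrm{softmax}(x_i) = e^{x_i - m} \cdot \frac{1}{s},
\qquad
\mathrm{logsoftmax}(x_i) = (x_i - m) - \log s,

which is why the LogSoftMax branch stores \log s (subtracted in the write-out loop) while the plain branch stores the reciprocal 1/s (a cheaper multiply per element).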
@@ -305,7 +305,7 @@ struct DispatchSoftmaxForwardKernelFunctor
reg_in[i][j] = nan_;
} else {
reg_in[i][j] = static_cast<scalar_t>(
-              ::exp(reg_in[i][j] - max_value) * sum_value);
+              std::exp(reg_in[i][j] - max_value) * sum_value);
}
}
*(reinterpret_cast<vec_t*>(out_data_ + group_offset + index)) = reg_in[i];
@@ -520,13 +520,13 @@ struct SoftmaxForwardKernelFunctor {
for (int j = 0; j < vec_size; ++j) {
IndexType linear_idx = i * vec_size + j - start;
if (linear_idx >= 0 && linear_idx < dim_size_)
-          sum_value += ::exp(accscalar_t(in_val[j]) - max_value);
+          sum_value += std::exp(accscalar_t(in_val[j]) - max_value);
}
}
sum_value = sycl::reduce_over_group(
item.get_group(), sum_value, sycl::plus<accscalar_t>());
if (LogSoftMax)
-      sum_value = ::log(sum_value);
+      sum_value = std::log(sum_value);
else
sum_value = accscalar_t(1) / sum_value;

@@ -543,7 +543,7 @@ struct SoftmaxForwardKernelFunctor {
in_data_[group_offset + linear_idx] - max_value - sum_value);
else
out_data_[group_offset + linear_idx] = static_cast<scalar_t>(
-            ::exp(in_data_[group_offset + linear_idx] - max_value) *
+            std::exp(in_data_[group_offset + linear_idx] - max_value) *
sum_value);
}
}
@@ -556,8 +556,8 @@ struct SoftmaxForwardKernelFunctor {
in_val[j] =
static_cast<scalar_t>(in_val[j] - max_value - sum_value);
else
-          in_val[j] =
-              static_cast<scalar_t>(::exp(in_val[j] - max_value) * sum_value);
+          in_val[j] = static_cast<scalar_t>(
+              std::exp(in_val[j] - max_value) * sum_value);
}
*(reinterpret_cast<vec_t*>(
out_data_ + group_offset - start + i * vec_size)) = in_val;
@@ -670,14 +670,14 @@ struct SpatialSoftmaxForwardKernelFunctor
value = *(reinterpret_cast<vec_t*>(in_data_ + group_offset + offset));
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
-      sum_value[j] = ::exp(value[j] - max_value[j]);
+      sum_value[j] = std::exp(value[j] - max_value[j]);
}
for (int i = local_row_id + block_row_; i < dim_size_; i += block_row_) {
offset = i * inner_size_ + global_col * vec_size;
value = *(reinterpret_cast<vec_t*>(in_data_ + group_offset + offset));
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
-        sum_value[j] += ::exp(value[j] - max_value[j]);
+        sum_value[j] += std::exp(value[j] - max_value[j]);
}
}
if (block_row_ > 1) {
@@ -690,15 +690,15 @@ struct SpatialSoftmaxForwardKernelFunctor
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
if (LogSoftMax)
-        sum_value[j] = ::log(local_data_[0][local_col_id][j]);
+        sum_value[j] = std::log(local_data_[0][local_col_id][j]);
else
sum_value[j] = accscalar_t(1) / local_data_[0][local_col_id][j];
}
} else {
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
if (LogSoftMax)
-        sum_value[j] = ::log(sum_value[j]);
+        sum_value[j] = std::log(sum_value[j]);
else
sum_value[j] = accscalar_t(1) / sum_value[j];
}
@@ -717,7 +717,7 @@ struct SpatialSoftmaxForwardKernelFunctor
static_cast<scalar_t>(in_val[j] - max_value[j] - sum_value[j]);
else
in_val[j] = static_cast<scalar_t>(
-            ::exp(in_val[j] - max_value[j]) * sum_value[j]);
+            std::exp(in_val[j] - max_value[j]) * sum_value[j]);
}
*(reinterpret_cast<vec_t*>(out_data_ + group_offset + offset)) = in_val;
}
@@ -890,7 +890,7 @@ struct DispatchSoftmaxBackwardKernelFunctor
for (int j = 0; j < vec_size; ++j) {
if (LogSoftMax) {
reg_out[i][j] = static_cast<scalar_t>(
-            reg_gradout[i][j] - ::exp(reg_out[i][j]) * sum_value);
+            reg_gradout[i][j] - std::exp(reg_out[i][j]) * sum_value);
} else {
reg_out[i][j] = static_cast<scalar_t>(
reg_out[i][j] * (reg_gradout[i][j] - sum_value));
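These two branches apply the standard gradients. Writing y for the saved forward output and g for the incoming gradient, sum_value is \sum_j g_j in the LogSoftMax case and \sum_j g_j y_j otherwise:

\frac{\partial L}{\partial x_i} = g_i - e^{y_i} \sum_j g_j \quad (\text{log-softmax, } y_i = \mathrm{logsoftmax}(x)_i),
\qquad
\frac{\partial L}{\partial x_i} = y_i \Bigl(g_i - \sum_j g_j\,y_j\Bigr) \quad (\text{softmax}).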
@@ -1117,7 +1117,7 @@ struct SoftmaxBackwardKernelFunctor {
auto offset = group_offset + linear_idx;
if (LogSoftMax) {
gradInput_[offset] =
-            gradOutput_[offset] - ::exp(output_[offset]) * sum_value;
+            gradOutput_[offset] - std::exp(output_[offset]) * sum_value;
} else {
gradInput_[offset] =
output_[offset] * (gradOutput_[offset] - sum_value);
@@ -1130,7 +1130,7 @@ struct SoftmaxBackwardKernelFunctor {
#pragma unroll(vec_size)
for (int j = 0; j < vec_size; ++j) {
if (LogSoftMax) {
-          out_val[j] = grad_val[j] - ::exp(out_val[j]) * sum_value;
+          out_val[j] = grad_val[j] - std::exp(out_val[j]) * sum_value;
} else {
out_val[j] = out_val[j] * (grad_val[j] - sum_value);
}
@@ -1259,7 +1259,7 @@ struct SpatialSoftmaxBackwardKernelFunctor
for (int j = 0; j < vec_size; ++j) {
if (LogSoftMax) {
out_val[j] = static_cast<scalar_t>(
-              gradout_val[j] - ::exp(out_val[j]) * sum_value[j]);
+              gradout_val[j] - std::exp(out_val[j]) * sum_value[j]);
} else {
out_val[j] = static_cast<scalar_t>(
out_val[j] * (gradout_val[j] - sum_value[j]));
