SYCL: use the SYCL group algorithm API instead of old-style sub-group shift utilities (#635)

Change 1:
SYCL is moving to the SYCL 2020 group algorithm API to unify the sub-group and
work-group APIs (e.g. `reduce_over_group`, `group_barrier`, and the
`shift_group` functions used here). The old style, which calls a separate
member function on each group object, is being deprecated.

| Old | SYCL 2020 |
| ------------- | ------------- |
| sg.shuffle_down(x, 1) | sycl::shift_group_left(sg, x, 1) |
| sg.shuffle_up(x, 1) | sycl::shift_group_right(sg, x, 1) |
| sg.shuffle(x, id) | sycl::select_from_group(sg, x, id) |
| sg.shuffle_xor(x, mask) | sycl::permute_group_by_xor(sg, x, mask) |

FYI: please **don't** use the `sg.shuffle*` member functions anymore; they are
deprecated.
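
For reference, here is a minimal, self-contained sketch of the new style in a
sub-group sum reduction. This is illustrative only and not code from this
commit; the buffer names, sizes, and kernel structure are made up.

```cpp
// Illustrative sketch: a sub-group tree reduction written against the
// SYCL 2020 group algorithm API (sycl::shift_group_left) instead of the
// deprecated sg.shuffle_down() member function.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  constexpr size_t N = 256; // one work-group, so sub-groups are fully populated
  sycl::queue q;
  float* data = sycl::malloc_shared<float>(N, q);
  float* partial = sycl::malloc_shared<float>(N, q); // per-sub-group partial sums
  for (size_t i = 0; i < N; ++i) {
    data[i] = 1.0f;
    partial[i] = 0.0f;
  }

  q.parallel_for(
       sycl::nd_range<1>{sycl::range<1>{N}, sycl::range<1>{N}},
       [=](sycl::nd_item<1> item) {
         auto sg = item.get_sub_group();
         float val = data[item.get_global_linear_id()];
         // Old (deprecated): val += sg.shuffle_down(val, offset);
         for (unsigned offset = sg.get_local_linear_range() / 2; offset > 0;
              offset >>= 1) {
           val += sycl::shift_group_left(sg, val, offset);
         }
         // Lane 0 of each sub-group now holds that sub-group's sum.
         if (sg.get_local_linear_id() == 0) {
           partial[sg.get_group_linear_id()] = val;
         }
       })
      .wait();

  float sum = 0.0f;
  for (size_t i = 0; i < N; ++i) {
    sum += partial[i];
  }
  std::printf("sum = %f (expected %zu)\n", sum, N);

  sycl::free(data, q);
  sycl::free(partial, q);
  return 0;
}
```

The same pattern applies to the other entries in the table:
`sycl::shift_group_right`, `sycl::select_from_group`, and
`sycl::permute_group_by_xor` take the sub-group as their first argument
instead of being called as members.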

Change 2:
`sycl::shift_group_*` is more restrictive than `sg.shuffle*`: the object being
shifted must be trivially copyable. This commit therefore uses a private
`pair` (`at::xpu::pair`) instead of `std::pair`.
FYI: CUDA uses `thrust::pair` in kernels.
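
A minimal sketch of the idea behind such a private pair follows. It is
illustrative only, not the actual `comm/XPUPair.h` added by this commit;
`std::pair` generally fails `std::is_trivially_copyable` because its
assignment operators are user-provided.

```cpp
// Illustrative sketch of a trivially copyable pair; the real implementation
// lives in comm/XPUPair.h (at::xpu::pair) and may differ.
#include <cstdint>
#include <type_traits>

namespace at::xpu {

template <typename T1, typename T2>
struct pair {
  using first_type = T1;
  using second_type = T2;

  T1 first;
  T2 second;

  pair() = default;
  pair(T1 f, T2 s) : first(f), second(s) {}
  // No user-provided copy/move/destructor, so the triviality of T1/T2 is
  // preserved and the pair can be passed to sycl::shift_group_left and friends.
};

} // namespace at::xpu

static_assert(
    std::is_trivially_copyable_v<at::xpu::pair<float, std::int64_t>>,
    "pair must stay trivially copyable for sub-group shuffles");

int main() {
  at::xpu::pair<float, std::int64_t> p(1.5f, 42);
  return p.second == 42 ? 0 : 1;
}
```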

---------

Co-authored-by: Feng Yuan <[email protected]>
Co-authored-by: xytintel <[email protected]>
3 people authored Aug 2, 2024
1 parent 35bea25 commit 9ea0728
Showing 16 changed files with 571 additions and 63 deletions.
18 changes: 10 additions & 8 deletions src/ATen/native/xpu/sycl/BatchNormKernels.cpp
@@ -263,7 +263,8 @@ static inline void group_reduce(
// uint32_t SIMD = sg.get_local_range()[0];
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
val = bin_op(val, static_cast<accscalar_t>(sg.shuffle_down(val, i)));
val = bin_op(
val, static_cast<accscalar_t>(sycl::shift_group_left(sg, val, i)));
}
if (sub_group_num == 1) {
if (lane_id == 0) {
@@ -294,7 +295,8 @@ }
}
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
val = bin_op(val, static_cast<accscalar_t>(sg.shuffle_down(val, i)));
val = bin_op(
val, static_cast<accscalar_t>(sycl::shift_group_left(sg, val, i)));
if (i >= ((sub_group_num + 1) >> 1))
break;
}
@@ -450,10 +452,10 @@ struct BatchNormCollectStatisticsKernelFunctor
// one value per subgroup
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
stat_accscalar_t o_avg = sg.shuffle_xor(avg, i);
int o_n = sg.shuffle_xor(n, i);
stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i);
int o_n = sycl::permute_group_by_xor(sg, n, i);
stat_accscalar_t factor = 1.0 / fmaxf(1.0, n + o_n);
var_n += sg.shuffle_xor(var_n, i) +
var_n += sycl::permute_group_by_xor(sg, var_n, i) +
(avg - o_avg) * (avg - o_avg) * n * o_n * factor;
avg = (n * avg + o_n * o_avg) * factor;
n += o_n;
@@ -481,10 +483,10 @@ }
}
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
stat_accscalar_t o_avg = sg.shuffle_xor(avg, i);
int o_n = sg.shuffle_xor(n, i);
stat_accscalar_t o_avg = sycl::permute_group_by_xor(sg, avg, i);
int o_n = sycl::permute_group_by_xor(sg, n, i);
stat_accscalar_t factor = 1.0f / fmaxf(1.0f, n + o_n);
var_n += sg.shuffle_xor(var_n, i) +
var_n += sycl::permute_group_by_xor(sg, var_n, i) +
(avg - o_avg) * (avg - o_avg) * n * o_n * factor;
avg = (n * avg + o_n * o_avg) * factor;
n += o_n;
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/DistanceKernels.cpp
@@ -120,7 +120,7 @@ scalar_t subgroup_reduce_agg_without_broadcast_impl(

#pragma unroll
for (int offset = (SG_SIZE >> 1); offset > 0; offset >>= 1) {
F::agg(value, sg.shuffle_down(value, offset));
F::agg(value, sycl::shift_group_left(sg, value, offset));
}
return value;
}
20 changes: 10 additions & 10 deletions src/ATen/native/xpu/sycl/GroupNormKernels.cpp
@@ -3,10 +3,10 @@
#include <ATen/Dispatch.h>
#include <ATen/OpMathType.h>
#include <ATen/native/CanUse32BitIndexMath.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/GroupReduceUtils.h>
#include <ATen/native/xpu/sycl/Loops.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>
#include <comm/MemoryFormat.h>
#include <comm/XPUMathCompat.h>

@@ -18,23 +18,23 @@ template <
typename index_t,
typename res_t>
struct WelfordOpsXPU
: public at::native::WelfordOps<scalar_t, acc_scalar_t, index_t, res_t> {
: public WelfordOps<scalar_t, acc_scalar_t, index_t, res_t> {
sycl::nd_item<1>& item;

public:
using acc_t = typename at::native::
WelfordOps<scalar_t, acc_scalar_t, index_t, res_t>::acc_t;
using acc_t =
typename WelfordOps<scalar_t, acc_scalar_t, index_t, res_t>::acc_t;
inline acc_t shfl_down(acc_t acc, int offset) const {
auto sg = item.get_sub_group();
return {
sg.shuffle_down(acc.mean, offset),
sg.shuffle_down(acc.m2, offset),
sg.shuffle_down(acc.n, offset),
sg.shuffle_down(acc.nf, offset)};
sycl::shift_group_left(sg, acc.mean, offset),
sycl::shift_group_left(sg, acc.m2, offset),
sycl::shift_group_left(sg, acc.n, offset),
sycl::shift_group_left(sg, acc.nf, offset)};
}

WelfordOpsXPU(acc_scalar_t correction, bool take_sqrt, sycl::nd_item<1>& item)
: at::native::WelfordOps<scalar_t, acc_scalar_t, index_t, res_t>(
: WelfordOps<scalar_t, acc_scalar_t, index_t, res_t>(
correction,
take_sqrt),
item(item) {}
@@ -43,7 +43,7 @@ struct WelfordOpsXPU
template <typename T, int SIMD>
struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;
using WelfordType = at::native::WelfordData<T_ACC, int64_t>;
using WelfordType = WelfordData<T_ACC, int64_t>;
using WelfordOp =
WelfordOpsXPU<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;

12 changes: 8 additions & 4 deletions src/ATen/native/xpu/sycl/Norm.h
@@ -39,8 +39,10 @@ static inline void norm_group_reduce(
// uint32_t SIMD = sg.get_local_range()[0];
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
sum1 = bin_op(sum1, static_cast<accscalar_t>(sg.shuffle_down(sum1, i)));
sum2 = bin_op(sum2, static_cast<accscalar_t>(sg.shuffle_down(sum2, i)));
sum1 = bin_op(
sum1, static_cast<accscalar_t>(sycl::shift_group_left(sg, sum1, i)));
sum2 = bin_op(
sum2, static_cast<accscalar_t>(sycl::shift_group_left(sg, sum2, i)));
}
if (sub_group_num == 1) {
sum1 = sycl::group_broadcast(sg, sum1, 0);
@@ -73,8 +75,10 @@ }
}
#pragma unroll
for (int i = 1; i < SIMD; i <<= 1) {
sum1 = bin_op(sum1, static_cast<accscalar_t>(sg.shuffle_down(sum1, i)));
sum2 = bin_op(sum2, static_cast<accscalar_t>(sg.shuffle_down(sum2, i)));
sum1 = bin_op(
sum1, static_cast<accscalar_t>(sycl::shift_group_left(sg, sum1, i)));
sum2 = bin_op(
sum2, static_cast<accscalar_t>(sycl::shift_group_left(sg, sum2, i)));
if (i >= ((sub_group_num + 1) >> 1))
break;
}
52 changes: 28 additions & 24 deletions src/ATen/native/xpu/sycl/Reduce.h
@@ -13,6 +13,7 @@
#include <c10/macros/Macros.h>
#include <comm/DeviceProperties.h>
#include <comm/SYCLContext.h>
#include <comm/XPUPair.h>
#include <functional>
#include <iosfwd>
#include <type_traits>
@@ -50,7 +51,7 @@ inline at::detail::Array<arg_t, out_vec_sz> group_reduce(
for (int offset = 1; offset < sg_size; offset <<= 1) {
#pragma unroll(out_vec_sz)
for (int i = 0; i < out_vec_sz; ++i) {
arg_t other = sg.shuffle_down(value[i], offset);
arg_t other = sycl::shift_group_left(sg, value[i], offset);
value[i] = combine(value[i], other);
}
}
@@ -71,7 +72,7 @@ inline at::detail::Array<arg_t, out_vec_sz> group_reduce(
for (int offset = 1; offset < sg_range; offset <<= 1) {
#pragma unroll(out_vec_sz)
for (int i = 0; i < out_vec_sz; ++i) {
arg_t other = sg.shuffle_down(value[i], offset);
arg_t other = sycl::shift_group_left(sg, value[i], offset);
value[i] = combine(value[i], other);
}
}
@@ -132,7 +133,7 @@ inline at::detail::Array<arg_t, out_vec_sz> group_x_reduce(
for (int offset = 1; offset < dim_x; offset <<= 1) {
#pragma unroll(out_vec_sz)
for (int i = 0; i < out_vec_sz; ++i) {
arg_t other = sg.shuffle_down(value[i], offset);
arg_t other = sycl::shift_group_left(sg, value[i], offset);
value[i] = combine(value[i], other);
}
}
@@ -541,11 +542,11 @@ struct ReduceOp {
(const scalar_t*)((const char*)src + base_offsets1);
value = item_reduce<output_vec_size>(pos, input_slice);
}
// TODO: Currently, there are bugs with shuffle_down when the arg_t is a
// pair for half dtype, We temporarily workaround to do
// TODO: Currently, there are bugs with sycl::shift_group_left when the
// arg_t is a pair for half dtype, We temporarily workaround to do
// "reduce_for_compound_dtype" function.
constexpr bool is_pair =
std::is_same<std::pair<scalar_t, int64_t>, arg_t>::value;
std::is_same<at::xpu::pair<scalar_t, int64_t>, arg_t>::value;

auto combine = [=](arg1_t value, arg2_t other) -> arg1_t {
return ops.combine(value, other);
@@ -832,8 +833,8 @@ struct ReduceOp {
return value_list[0];
}

// TODO: Currently, there are bugs with shuffle_down when the arg_t is a
// pair with half dtype, We temporarily workaround to do
// TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t
// is a pair with half dtype, We temporarily workaround to do
// "reduce_for_compound_dtype" function.
template <int output_vec_size>
at::detail::Array<arg_t, output_vec_size> group_reduce_for_compound_dtype(
@@ -850,7 +851,7 @@
for (int offset = 1; offset < (int)sbgrpSize; offset <<= 1) {
#pragma unroll(output_vec_size)
for (int i = 0; i < output_vec_size; ++i) {
arg_t other = sg.shuffle_down(value[i], offset);
arg_t other = sycl::shift_group_left(sg, value[i], offset);
value[i] = ops.combine(value[i], other);
}
}
@@ -875,12 +876,13 @@
#pragma unroll(output_vec_size)
for (int i = 0; i < output_vec_size; ++i) {
// Shuffle down separately for first and second pair.
std::pair<typename arg_t::first_type, typename arg_t::second_type>
other = std::pair<
typename arg_t::first_type,
typename arg_t::second_type>(
sg.shuffle_down(value[i].first, offset),
sg.shuffle_down(value[i].second, offset));
at::xpu::
pair<typename arg_t::first_type, typename arg_t::second_type>
other = at::xpu::pair<
typename arg_t::first_type,
typename arg_t::second_type>(
sycl::shift_group_left(sg, value[i].first, offset),
sycl::shift_group_left(sg, value[i].second, offset));
value[i] = ops.combine(value[i], other);
}
}
@@ -907,8 +909,8 @@
return value;
}

// TODO: Currently, there are bugs with shuffle_down when the arg_t is a
// pair for half dtype, We temporarily workaround to do
// TODO: Currently, there are bugs with sycl::shift_group_left when the arg_t
// is a pair for half dtype, We temporarily workaround to do
// "reduce_for_compound_dtype" function.
template <int output_vec_size>
at::detail::Array<arg_t, output_vec_size> group_x_reduce_for_compound_dtype(
@@ -947,11 +949,11 @@
for (int offset = 1; offset < dim_x; offset <<= 1) {
#pragma unroll(output_vec_size)
for (int i = 0; i < output_vec_size; ++i) {
std::pair<typename arg_t::first_type, typename arg_t::second_type>
other = std::
at::xpu::pair<typename arg_t::first_type, typename arg_t::second_type>
other = xpu::
pair<typename arg_t::first_type, typename arg_t::second_type>(
sg.shuffle_down(value[i].first, offset),
sg.shuffle_down(value[i].second, offset));
sycl::shift_group_left(sg, value[i].first, offset),
sycl::shift_group_left(sg, value[i].second, offset));
value[i] = ops.combine(value[i], other);
}
}
@@ -1028,7 +1030,8 @@

// Currently implemented for max of two outputs
template <class T1, class T2>
void set_results(const std::pair<T1, T2> x, const index_t base_offset) const {
void set_results(const at::xpu::pair<T1, T2> x, const index_t base_offset)
const {
if (noutputs >= 1) {
auto res0 = (T1*)((char*)dst[0] + base_offset);
*res0 = x.first;
@@ -1121,9 +1124,10 @@
decltype(combine),
output_vec_size>(pos, shared_memory, value, combine);
if (config.should_group_x_reduce()) {
// TODO: workaround because sg.shuffle_down will fail on `half` dtype.
// TODO: workaround because sycl::shift_group_left will fail on `half`
// dtype.
constexpr bool is_pair =
std::is_same<std::pair<scalar_t, int64_t>, arg_t>::value;
std::is_same<at::xpu::pair<scalar_t, int64_t>, arg_t>::value;
if constexpr (is_pair) {
value = group_x_reduce_for_compound_dtype<output_vec_size>(
pos, value, shared_memory);
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceAMinMaxKernel.cpp
@@ -1,9 +1,9 @@
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

namespace at::native::xpu {

@@ -12,7 +12,7 @@ void _min_max_values_kernel_xpu_impl(TensorIterator& iter) {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinMaxOps<scalar_t, scalar_t, int32_t>{},
std::pair<scalar_t, scalar_t>(
at::xpu::pair<scalar_t, scalar_t>(
at::numeric_limits<scalar_t>::upper_bound(),
at::numeric_limits<scalar_t>::lower_bound()));
}
5 changes: 3 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceArgMaxKernel.cpp
@@ -1,9 +1,9 @@
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

namespace at {
namespace native {
@@ -14,7 +14,8 @@ void argmax_kernel_impl(TensorIterator& iter) {
gpu_reduce_kernel<scalar_t, int64_t>(
iter,
ArgMaxOps<acc_t>{},
std::pair<acc_t, int64_t>(at::numeric_limits<acc_t>::lower_bound(), 0));
at::xpu::pair<acc_t, int64_t>(
at::numeric_limits<acc_t>::lower_bound(), 0));
};

void argmax_kernel(TensorIterator& iter) {
5 changes: 3 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceArgMinKernel.cpp
@@ -1,9 +1,9 @@
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

namespace at::native::xpu {

@@ -12,7 +12,8 @@ void argmin_kernel_impl(TensorIterator& iter) {
gpu_reduce_kernel<scalar_t, int64_t>(
iter,
ArgMinOps<acc_t>{},
std::pair<acc_t, int64_t>(at::numeric_limits<acc_t>::upper_bound(), 0));
at::xpu::pair<acc_t, int64_t>(
at::numeric_limits<acc_t>::upper_bound(), 0));
};

void argmin_kernel(TensorIterator& iter) {
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.cpp
@@ -1,7 +1,7 @@
#include <ATen/Dispatch.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
@@ -38,7 +38,7 @@ void max_kernel(TensorIterator& iter) {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MaxOps<scalar_t>{},
std::pair<scalar_t, int64_t>(
at::xpu::pair<scalar_t, int64_t>(
at::numeric_limits<scalar_t>::lower_bound(), 0));
});
}
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceMinValuesKernels.cpp
@@ -1,6 +1,6 @@
#include <ATen/Dispatch.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
@@ -37,7 +37,7 @@ void min_kernel(TensorIterator& iter) {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinOps<scalar_t>{},
std::pair<scalar_t, int64_t>(
at::xpu::pair<scalar_t, int64_t>(
at::numeric_limits<scalar_t>::upper_bound(), 0));
});
}
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/ReduceMomentKernels.cpp
@@ -1,9 +1,9 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/NumericLimits.h>
#include <ATen/native/xpu/sycl/Reduce.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

namespace at {
namespace native {
@@ -18,7 +18,7 @@ void std_var_template(
// This is necessary to lower register usage that leads to register spills.
using accscalar_t = at::acc_type_device<scalar_t, kXPU>;
using ops_t =
WelfordOps<scalar_t, accscalar_t, int32_t, std::pair<out_t, out_t>>;
WelfordOps<scalar_t, accscalar_t, int32_t, at::xpu::pair<out_t, out_t>>;
ops_t ops(static_cast<accscalar_t>(correction_opt), take_sqrt);
gpu_reduce_kernel<scalar_t, out_t, 2>(iter, ops, typename ops_t::acc_t{});
}
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/ReduceNormKernel.cpp
@@ -1,5 +1,5 @@
#include <ATen/Dispatch.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/xpu/sycl/SharedReduceOps.h>

#include <ATen/native/xpu/sycl/Reduce.h>

Expand Down