add shfl_up, shfl_down and shfl_xor methods
AuroraPerego committed Nov 6, 2023
1 parent 81150c6 commit 09a947c
Showing 4 changed files with 276 additions and 24 deletions.
132 changes: 124 additions & 8 deletions include/alpaka/warp/Traits.hpp
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

@@ -39,6 +39,18 @@ namespace alpaka::warp
template<typename TWarp, typename TSfinae = void>
struct Shfl;

//! The shfl up warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflUp;

//! The shfl down warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflDown;

//! The shfl xor warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflXor;

//! The active mask trait.
template<typename TWarp, typename TSfinae = void>
struct Activemask;
@@ -162,7 +174,7 @@ namespace alpaka::warp
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[(srcLane + width*floor(threadIdx.x/width))%width];
//! return values[width*(threadIdx.x/width) + srcLane%width];
//!
//! However, it does not use shared memory.
//!
@@ -182,19 +194,123 @@ namespace alpaka::warp
//! \param width number of threads receiving a single value
//! \return val from the thread with index srcLane.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0)
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
}
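
For orientation, a minimal usage sketch (not part of this commit; the kernel name, the buffer, and the assumption of a one-dimensional block are illustrative): broadcasting lane 0's value to every lane of the warp through the accelerator, which models ConceptWarp.

#include <alpaka/alpaka.hpp>

#include <cstdint>

// Illustrative kernel: each thread writes the value contributed by lane 0 of its warp.
struct ShflBroadcastKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t* out) const
    {
        auto const threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
        std::int32_t const mine = static_cast<std::int32_t>(threadIdx);
        // width = 0 (the default) selects the full warp size.
        out[threadIdx] = alpaka::warp::shfl(acc, mine, 0);
    }
};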

//! shfl for float vals
//! Exchange data between threads within a warp.
//! It copies from a lane with a lower ID relative to the caller.
//! The source lane ID is calculated by subtracting delta from the caller’s lane ID.
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + threadIdx.x%width - delta];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the source lane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl_up} else {shfl_up}.
//!
//! * Commonly used with width = warpsize (the default); returns values[threadIdx.x - delta] if threadIdx.x >=
//! delta, and the caller's own value otherwise
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to exchange
//! \param offset corresponds to the delta used to compute the source lane ID
//! \param width width of the segments into which the warp is subdivided
//! \return val from the source lane, or the caller's own val if the source lane is out of range.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0)
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
}
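
A hedged usage sketch of the new shfl_up (not part of this commit; the kernel and variable names are illustrative, the block is assumed to be one-dimensional and a single warp wide): an inclusive prefix sum across the warp.

#include <alpaka/alpaka.hpp>

#include <cstdint>

// Illustrative kernel: inclusive warp-level prefix sum using shfl_up.
struct WarpInclusiveScanKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t const* in, std::int32_t* out) const
    {
        auto const threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
        auto const warpExtent = alpaka::warp::getSize(acc);
        auto const lane = static_cast<std::int32_t>(threadIdx) % warpExtent;
        std::int32_t sum = in[threadIdx];
        // Every lane calls shfl_up on the same line; only the accumulation is guarded.
        for(std::uint32_t delta = 1u; delta < static_cast<std::uint32_t>(warpExtent); delta *= 2u)
        {
            std::int32_t const up = alpaka::warp::shfl_up(acc, sum, delta);
            if(lane >= static_cast<std::int32_t>(delta))
                sum += up;
        }
        out[threadIdx] = sum;
    }
};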

//! Exchange data between threads within a warp.
//! It copies from a lane with a higher ID relative to the caller.
//! The source lane ID is calculated by adding delta to the caller’s lane ID.
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + threadIdx.x%width + delta];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the source lane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl_down} else {shfl_down}.
//!
//! * Commonly used with width = warpsize (the default); returns values[threadIdx.x + delta] if threadIdx.x + delta <
//! warpsize, and the caller's own value otherwise
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to exchange
//! \param offset corresponds to the delta used to compute the source lane ID
//! \param width width of the segments into which the warp is subdivided
//! \return val from the source lane, or the caller's own val if the source lane is out of range.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
}
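
A hedged usage sketch of the new shfl_down (not part of this commit; names are illustrative, the block is assumed to be one-dimensional and a single warp wide, and the warp size a power of two): a warp-level sum reduction whose result ends up in lane 0.

#include <alpaka/alpaka.hpp>

#include <cstdint>

// Illustrative kernel: warp-level sum reduction using shfl_down; lane 0 holds the total.
struct WarpSumKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t const* in, std::int32_t* out) const
    {
        auto const threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
        auto const warpExtent = alpaka::warp::getSize(acc);
        std::int32_t sum = in[threadIdx];
        // Halve the exchange distance each step; all lanes execute the same shfl_down call.
        for(std::uint32_t delta = static_cast<std::uint32_t>(warpExtent) / 2u; delta > 0u; delta /= 2u)
            sum += alpaka::warp::shfl_down(acc, sum, delta);
        if(static_cast<std::int32_t>(threadIdx) % warpExtent == 0)
            out[threadIdx / static_cast<std::size_t>(warpExtent)] = sum;
    }
};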

//! Exchange data between threads within a warp.
//! It copies from a lane selected by a bitwise XOR of the caller's own lane ID.
//! The source lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with delta.
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + (threadIdx.x%width ^ delta)];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the source lane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl_xor} else {shfl_xor}.
//!
//! * Commonly used with width = warpsize (the default); returns values[threadIdx.x ^ delta]
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to exchange
//! \param offset the value XOR-ed with the caller's lane ID to obtain the source lane ID
//! \param width width of the segments into which the warp is subdivided
//! \return val from the source lane.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, offset, width ? width : getSize(warp));
}
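
A hedged usage sketch of the new shfl_xor (not part of this commit; names are illustrative, the block is assumed to be one-dimensional and a single warp wide, and the warp size a power of two): a butterfly all-reduce after which every lane holds the warp-wide sum.

#include <alpaka/alpaka.hpp>

#include <cstdint>

// Illustrative kernel: butterfly all-reduce using shfl_xor; every lane ends up with the sum.
struct WarpAllReduceKernel
{
    template<typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc, std::int32_t const* in, std::int32_t* out) const
    {
        auto const threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];
        std::int32_t sum = in[threadIdx];
        for(std::int32_t mask = alpaka::warp::getSize(acc) / 2; mask > 0; mask /= 2)
            sum += alpaka::warp::shfl_xor(acc, sum, mask);
        out[threadIdx] = sum;
    }
};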
} // namespace alpaka::warp
63 changes: 62 additions & 1 deletion include/alpaka/warp/WarpGenericSycl.hpp
@@ -114,7 +114,6 @@ namespace alpaka::warp::trait
static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
{
ALPAKA_ASSERT_OFFLOAD(width > 0);
ALPAKA_ASSERT_OFFLOAD(srcLane < width);
ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);

/* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
@@ -132,6 +131,68 @@ namespace alpaka::warp::trait
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflUp<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_up(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::uint32_t offset, /* must be the same for all work-items in the group */
std::int32_t width)
{
std::int32_t offset_int = static_cast<std::int32_t>(offset);
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_group_id = actual_item_id / width;
auto const actual_src_id = actual_item_id - offset_int;
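// If the source lane would fall before the start of the caller's width-wide segment,
// keep the caller's own lane so its value is returned unchanged (mirrors CUDA's __shfl_up_sync).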
auto const src = actual_src_id >= actual_group_id * width
? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
: sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflDown<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_down(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::uint32_t offset,
std::int32_t width)
{
std::int32_t offset_int = static_cast<std::int32_t>(offset);
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_group_id = actual_item_id / width;
auto const actual_src_id = actual_item_id + offset_int;
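// If the source lane would fall past the end of the caller's width-wide segment,
// keep the caller's own lane so its value is returned unchanged (mirrors CUDA's __shfl_down_sync).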
auto const src = actual_src_id < (actual_group_id + 1) * width
? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
: sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflXor<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_xor(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::int32_t offset,
std::int32_t /*width*/)
{
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
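// The partner lane is the caller's sub-group-local ID XOR-ed with offset; width is not used,
// so the XOR is applied across the whole sub-group.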
auto const actual_src_id = actual_item_id ^ offset;
auto const src = sycl::id<1>{static_cast<std::size_t>(actual_src_id)};
return sycl::select_from_group(actual_group, value, src);
}
};
} // namespace alpaka::warp::trait

#endif
42 changes: 38 additions & 4 deletions include/alpaka/warp/WarpSingleThread.hpp
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

@@ -65,18 +65,52 @@ namespace alpaka::warp
template<>
struct Shfl<WarpSingleThread>
{
template<typename T>
static auto shfl(
warp::WarpSingleThread const& /*warp*/,
std::int32_t val,
T val,
std::int32_t /*srcLane*/,
std::int32_t /*width*/)
{
return val;
}
};

static auto shfl(
template<>
struct ShflUp<WarpSingleThread>
{
template<typename T>
static auto shfl_up(
warp::WarpSingleThread const& /*warp*/,
T val,
std::uint32_t /*offset*/,
std::int32_t /*width*/)
{
return val;
}
};

template<>
struct ShflDown<WarpSingleThread>
{
template<typename T>
static auto shfl_down(
warp::WarpSingleThread const& /*warp*/,
T val,
std::uint32_t /*offset*/,
std::int32_t /*width*/)
{
return val;
}
};

template<>
struct ShflXor<WarpSingleThread>
{
template<typename T>
static auto shfl_xor(
warp::WarpSingleThread const& /*warp*/,
float val,
T val,
std::int32_t /*offset*/,
std::int32_t /*width*/)
{
63 changes: 52 additions & 11 deletions include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

@@ -113,34 +113,75 @@ namespace alpaka::warp
template<>
struct Shfl<WarpUniformCudaHipBuiltIn>
{
//-------------------------------------------------------------
template<typename T>
__device__ static auto shfl(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
float val,
T val,
int srcLane,
std::int32_t width) -> float
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, width);
# else
return __shfl(val, srcLane, width);
# endif
}
};

//-------------------------------------------------------------
__device__ static auto shfl(
template<>
struct ShflUp<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_up(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
std::int32_t val,
int srcLane,
std::int32_t width) -> std::int32_t
T val,
std::uint32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, width);
return __shfl_up_sync(activemask(warp), val, offset, width);
# else
return __shfl(val, srcLane, width);
return __shfl_up(val, offset, width);
# endif
}
};

template<>
struct ShflDown<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_down(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
T val,
std::uint32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_down_sync(activemask(warp), val, offset, width);
# else
return __shfl_down(val, offset, width);
# endif
}
};

template<>
struct ShflXor<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_xor(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
T val,
std::int32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_xor_sync(activemask(warp), val, offset, width);
# else
return __shfl_xor(val, offset, width);
# endif
}
};

} // namespace trait
# endif
} // namespace alpaka::warp
