Skip to content

Commit

Permalink
Implement grid sync and cooperative kernel functionality for Intel TBB
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVarvarin committed Nov 16, 2024
1 parent 1222309 commit f2af59a
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 5 deletions.
38 changes: 37 additions & 1 deletion include/alpaka/acc/AccCpuTbbBlocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "alpaka/block/shared/st/BlockSharedMemStMember.hpp"
#include "alpaka/block/sync/BlockSyncNoOp.hpp"
#include "alpaka/core/DemangleTypeNames.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/idx/bt/IdxBtZero.hpp"
#include "alpaka/idx/gb/IdxGbRef.hpp"
#include "alpaka/intrinsic/IntrinsicCpu.hpp"
Expand Down Expand Up @@ -62,6 +63,7 @@ namespace alpaka
, public BlockSharedMemDynMember<>
, public BlockSharedMemStMember<>
, public BlockSyncNoOp
, public GridSyncBarrierTbb<TIdx>
, public IntrinsicCpu
, public MemFenceCpu
# ifdef ALPAKA_DISABLE_VENDOR_RNG
Expand Down Expand Up @@ -94,6 +96,7 @@ namespace alpaka
, BlockSharedMemDynMember<>(blockSharedMemDynSizeBytes)
, BlockSharedMemStMember<>(staticMemBegin(), staticMemCapacity())
, m_gridBlockIdx(Vec<TDim, TIdx>::zeros())
, GridSyncBarrierTbb<TIdx>(getWorkDiv<Grid, Threads>(workDiv).prod())
{
}

Expand Down Expand Up @@ -148,7 +151,7 @@ namespace alpaka
// m_globalMemSizeBytes
getMemBytes(dev),
// m_cooperativeLaunch
false};
true};
}
};

Expand Down Expand Up @@ -199,6 +202,39 @@ namespace alpaka
}
};

//! The CPU TBB block accelerator execution cooperative task type trait specialization.
//!
//! Validates the work division for a cooperative launch and, if it is acceptable, wraps the kernel and its
//! arguments into a TaskKernelCpuTbbBlocks.
template<typename TDim, typename TIdx, typename TWorkDiv, typename TKernelFnObj, typename... TArgs>
struct CreateTaskCooperativeKernel<AccCpuTbbBlocks<TDim, TIdx>, TWorkDiv, TKernelFnObj, TArgs...>
{
    //! \param workDiv The work division of the launch; must use exactly one thread per block.
    //! \param kernelFnObj The kernel function object.
    //! \param args The kernel arguments, perfectly forwarded into the task.
    //! \return A TaskKernelCpuTbbBlocks for the given work division, kernel and arguments.
    //! \throws std::runtime_error if the block contains more than one thread, or if the number of blocks in the
    //!         grid exceeds tbb::this_task_arena::max_concurrency().
    ALPAKA_FN_HOST static auto createTaskCooperativeKernel(
        TWorkDiv const& workDiv,
        TKernelFnObj const& kernelFnObj,
        TArgs&&... args)
    {
        if(workDiv.m_blockThreadExtent.prod() != static_cast<TIdx>(1u))
        {
            throw std::runtime_error(
                "The given work division is not valid for a single thread Acc: "
                + getAccName<AccCpuTbbBlocks<TDim, TIdx>>() + ". Threads per block should be 1!");
        }

        // Reject grids with more blocks than the current task arena reports as its concurrency limit.
        // NOTE(review): presumably because a grid-wide barrier can only complete when all blocks are running
        // at the same time - confirm against the kernel task implementation.
        auto const gridBlockExtent = getWorkDiv<Grid, Blocks>(workDiv);
        auto const requestedBlocks = gridBlockExtent.prod();
        auto const maxBlocks = tbb::this_task_arena::max_concurrency();
        if(requestedBlocks > static_cast<TIdx>(maxBlocks))
        {
            throw std::runtime_error(
                "The number of requested blocks is larger than the maximum of the device for TBB "
                "accelerator. Requested: "
                + std::to_string(requestedBlocks) + ", maximum allowed: " + std::to_string(maxBlocks)
                + ". Use getMaxActiveBlocks().");
        }

        return TaskKernelCpuTbbBlocks<TDim, TIdx, TKernelFnObj, TArgs...>(
            workDiv,
            kernelFnObj,
            std::forward<TArgs>(args)...);
    }
};

//! The CPU TBB block execution task platform type trait specialization.
template<typename TDim, typename TIdx>
struct PlatformType<AccCpuTbbBlocks<TDim, TIdx>>
Expand Down
2 changes: 2 additions & 0 deletions include/alpaka/alpaka.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#include "alpaka/core/Align.hpp"
#include "alpaka/core/AlignedAlloc.hpp"
#include "alpaka/core/Assert.hpp"
#include "alpaka/core/BarrierTbb.h"
#include "alpaka/core/BarrierThread.hpp"
#include "alpaka/core/BoostPredef.hpp"
#include "alpaka/core/ClipCast.hpp"
Expand Down Expand Up @@ -108,6 +109,7 @@
// grid
#include "alpaka/grid/GridSyncBarrierCpuOmp.hpp"
#include "alpaka/grid/GridSyncBarrierCpuThread.hpp"
#include "alpaka/grid/GridSyncCpuTbbBlocks.hpp"
#include "alpaka/grid/GridSyncGpuCudaHip.hpp"
#include "alpaka/grid/GridSyncNoOp.hpp"
#include "alpaka/grid/Traits.hpp"
Expand Down
92 changes: 92 additions & 0 deletions include/alpaka/core/BarrierTbb.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

// Comment this out to switch to the tbb::task::suspend implementation. That implementation polls with a short sleep instead of blocking on a condition variable.
#define ALPAKA_TBB_BARRIER_USE_MUTEX

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

#include "alpaka/core/Common.hpp"
#include "alpaka/grid/Traits.hpp"

#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
#    include <condition_variable>
#    include <mutex>
#else
#    include <oneapi/tbb/task.h>

#    include <atomic>
#    include <chrono>
#    include <thread>
#endif

namespace alpaka::core
{
    namespace tbb
    {
        //! A self-resetting barrier for a fixed number of participants.
        //!
        //! The implementation is selected at compile time by ALPAKA_TBB_BARRIER_USE_MUTEX:
        //! - defined: the counters are guarded by a std::mutex and waiters block on a std::condition_variable;
        //! - not defined: the counters are std::atomic and waiters poll the generation counter from inside
        //!   oneapi::tbb::task::suspend, sleeping 100 microseconds between polls.
        template<typename TIdx>
        class BarrierThread final
        {
        public:
            //! \param threadCount The number of wait() calls that have to arrive before the barrier releases.
            explicit BarrierThread(TIdx const& threadCount)
                : m_threadCount(threadCount)
                , m_curThreadCount(threadCount)
                , m_generation(0)
            {
            }

            //! Waits for all the other threads to reach the barrier.
            auto wait() -> void
            {
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
                std::unique_lock<std::mutex> lock(m_mtxBarrier);
#endif
                // Remember which barrier round this call belongs to. In the mutex variant this is read with the
                // lock already held, so every access to the shared counters is guarded by m_mtxBarrier.
                TIdx const generationWhenEnteredTheWait = m_generation;

                if(--m_curThreadCount == 0)
                {
                    // Last arrival: re-arm the counter for the next round first, then advance the generation,
                    // which is the condition the waiters are checking.
                    m_curThreadCount = m_threadCount;
                    ++m_generation;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
                    m_cvAllThreadsReachedBarrier.notify_all();
#endif
                }
                else
                {
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
                    // The predicate compares generations, so spurious wake-ups do not release the waiter.
                    m_cvAllThreadsReachedBarrier.wait(
                        lock,
                        [this, generationWhenEnteredTheWait] { return generationWhenEnteredTheWait != m_generation; });
#else
                    oneapi::tbb::task::suspend(
                        [&generationWhenEnteredTheWait, this](oneapi::tbb::task::suspend_point tag)
                        {
                            // Poll until the last arrival has advanced the generation.
                            while(generationWhenEnteredTheWait == this->m_generation)
                            {
                                std::this_thread::sleep_for(std::chrono::microseconds(100));
                            }
                            oneapi::tbb::task::resume(tag);
                        });
#endif
                }
            }

        private:
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
            std::mutex m_mtxBarrier;
            std::condition_variable m_cvAllThreadsReachedBarrier;
#endif
            //! Number of participants per round.
            TIdx const m_threadCount;
#ifdef ALPAKA_TBB_BARRIER_USE_MUTEX
            //! Participants still missing in the current round.
            TIdx m_curThreadCount;
            //! Incremented once per completed round.
            TIdx m_generation;
#else
            //! Participants still missing in the current round.
            std::atomic<TIdx> m_curThreadCount;
            //! Incremented once per completed round.
            std::atomic<TIdx> m_generation;
#endif
        };
    } // namespace tbb
} // namespace alpaka::core

#endif
43 changes: 43 additions & 0 deletions include/alpaka/grid/GridSyncCpuTbbBlocks.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* Copyright 2024 Mykhailo Varvarin
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

#include "alpaka/core/Common.hpp"
#include "alpaka/grid/Traits.hpp"

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
# include "alpaka/core/BarrierTbb.h"

namespace alpaka
{
//! Grid synchronization for the TBB blocks accelerator, implemented on top of core::tbb::BarrierThread.
template<typename TIdx>
class GridSyncBarrierTbb : public interface::Implements<ConceptGridSync, GridSyncBarrierTbb<TIdx>>
{
public:
    //! The barrier type that performs the actual waiting.
    using Barrier = core::tbb::BarrierThread<TIdx>;

    //! \param gridThreadCount Passed to the barrier constructor as the number of participants.
    ALPAKA_FN_HOST explicit GridSyncBarrierTbb(TIdx const& gridThreadCount)
        : m_barrier{gridThreadCount}
    {
    }

    //! Declared mutable so that wait() can be called through a const reference to this object.
    mutable Barrier m_barrier;
};

namespace trait
{
    //! The grid synchronization trait specialization for GridSyncBarrierTbb.
    template<typename TIdx>
    struct SyncGridThreads<GridSyncBarrierTbb<TIdx>>
    {
        //! Calls wait() on the barrier held by the given grid synchronization object.
        //!
        //! \param gridSync The grid synchronization object whose barrier is waited on.
        ALPAKA_FN_HOST static auto syncGridThreads(GridSyncBarrierTbb<TIdx> const& gridSync) -> void
        {
            // m_barrier is a mutable member, so a non-const reference can be taken from the const object.
            auto& barrier = gridSync.m_barrier;
            barrier.wait();
        }
    };

} // namespace trait
} // namespace alpaka

#endif
24 changes: 20 additions & 4 deletions include/alpaka/kernel/TaskKernelCpuTbbBlocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,15 @@ namespace alpaka
tbb::this_task_arena::isolate(
[&]
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

tbb::parallel_for(
static_cast<TIdx>(0),
static_cast<TIdx>(numBlocksInGrid),
[&](TIdx i)
{
AccCpuTbbBlocks<TDim, TIdx> acc(
*static_cast<WorkDivMembers<TDim, TIdx> const*>(this),
blockSharedMemDynSizeBytes);

acc.m_gridBlockIdx
= mapIdx<TDim::value>(Vec<DimInt<1u>, TIdx>(static_cast<TIdx>(i)), gridBlockExtent);

Expand Down Expand Up @@ -177,6 +177,22 @@ namespace alpaka
return kernelFunctionAttributes;
}
};

//! The CPU TBB blocks get max active blocks for cooperative kernel specialization.
//!
//! NOTE(review): createTaskCooperativeKernel for this accelerator validates the block count against
//! tbb::this_task_arena::max_concurrency(), while this trait reports m_multiProcessorCount - confirm that the
//! two limits are meant to agree.
template<typename TDev, typename TKernelFnObj, typename TDim, typename TIdx, typename... TArgs>
struct MaxActiveBlocks<AccCpuTbbBlocks<TDim, TIdx>, TDev, TKernelFnObj, TDim, TIdx, TArgs...>
{
    //! \param device The device whose accelerator properties are queried.
    //! \return The m_multiProcessorCount entry of the accelerator device properties, converted to int.
    //!
    //! The kernel, the block/element extents and the kernel arguments are not used.
    ALPAKA_FN_HOST static auto getMaxActiveBlocks(
        TKernelFnObj const& /*kernelFnObj*/,
        TDev const& device,
        alpaka::Vec<TDim, TIdx> const& /*blockThreadExtent*/,
        alpaka::Vec<TDim, TIdx> const& /*threadElemExtent*/,
        TArgs const&... /*args*/) -> int
    {
        using Acc = AccCpuTbbBlocks<TDim, TIdx>;

        auto const devProps = trait::GetAccDevProps<Acc>::getAccDevProps(device);
        return static_cast<int>(devProps.m_multiProcessorCount);
    }
};
} // namespace trait
} // namespace alpaka

Expand Down

0 comments on commit f2af59a

Please sign in to comment.