
Commit

fix a few bugs
- fix double page clean-up
- fix that the page lock during `tryUsePage()` was not taken into account
psychocoderHPC committed Jun 17, 2024
1 parent bffe2aa commit 58fe77a
Showing 1 changed file with 102 additions and 62 deletions.
164 changes: 102 additions & 62 deletions src/include/mallocMC/creationPolicies/Scatter.hpp
@@ -463,12 +463,12 @@ namespace mallocMC
// increase the fill level
const uint32 filllevel = alpaka::atomicOp<alpaka::AtomicAdd>(acc, (uint32*) &(_ptes[page].count), 1u);

// if resetfreedpages == false we do not need to re-check filllevel or chunksize
// if resetfreedpages == false we do not need to re-check chunksize
bool tryAllocMem = !resetfreedpages;

// note: if filllevel >= pagesize then the page is currently being freed by another thread
if constexpr(resetfreedpages)
if(filllevel < pagesize)
if(filllevel < pagesize)
{
if constexpr(resetfreedpages)
{
/* Re-check chunk size (it could be that the page was freed in the meantime...)
* Use atomic to guarantee that no other thread deleted the page and reinitialized
@@ -488,6 +492,12 @@
if(oldChunksize != 0)
chunksize = oldChunksize;
}
}
else
{
// note: if filllevel >= pagesize then the page is currently being freed by another thread
tryAllocMem = false;
}

if(tryAllocMem)
{
@@ -512,7 +518,15 @@

// this one is full or not usable
if(chunk_ptr == nullptr)
alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &(_ptes[page].count), 1u);
{
uint32_t oldFillLevel
= alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &(_ptes[page].count), 1u);
if(oldFillLevel == 1u)
{
// at this point chunksize is guaranteed to hold the page's chunk size
tryCleanPage(acc, page);
}
}

return chunk_ptr;
}
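
The failure path above is one of the two fixes in this commit: alpaka::AtomicSub returns the counter value before the decrement, so checking oldFillLevel == 1u singles out exactly one thread, the one that dropped the fill level to zero, to attempt the page clean-up. deallocChunked further down relies on the same test. Below is a minimal host-side sketch of this last-one-out pattern; std::atomic stands in for alpaka::atomicOp, and the Pte type and cleanPage function are hypothetical stand-ins, not the allocator's real types.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the allocator's page table entry.
struct Pte
{
    std::atomic<std::uint32_t> count{0u};
};

// Placeholder for tryCleanPage.
void cleanPage(std::uint32_t page)
{
    std::printf("page %u cleaned\n", page);
}

void releaseChunk(Pte& pte, std::uint32_t page)
{
    // fetch_sub returns the value *before* the decrement, mirroring
    // alpaka::AtomicSub, so exactly one thread observes the old value 1.
    std::uint32_t const oldFillLevel = pte.count.fetch_sub(1u);
    if(oldFillLevel == 1u)
        cleanPage(page); // only the last releaser attempts the clean-up
}

int main()
{
    Pte pte;
    pte.count = 2u;
    releaseChunk(pte, 0u); // oldFillLevel == 2, no clean-up
    releaseChunk(pte, 0u); // oldFillLevel == 1, triggers the clean-up
}
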
@@ -586,9 +600,8 @@
0u,
minAllocation);
// Check if the chunk size can be used even if the size is not an exact match.
auto const isChunkSizeInRange = [&](uint32_t currentChunkSize) {
return currentChunkSize >= bytes && currentChunkSize <= maxchunksize;
};
auto const isChunkSizeInRange = [&](uint32_t currentChunkSize)
{ return currentChunkSize >= bytes && currentChunkSize <= maxchunksize; };
uint32_t useChunkSize = 0u;
if(beforeChunkSize == 0u)
{
@@ -636,6 +649,73 @@
return nullptr;
}

/** tries to clean up the page
*
* The last thread reducing the page count to zero should call this method.
*/
template<typename AlpakaAcc>
ALPAKA_FN_ACC void tryCleanPage(const AlpakaAcc& acc, uint32 page)
{
if constexpr(resetfreedpages)
{
/* Workaround for nvcc because the in-class defined static constexpr variable cannot be passed
* into functions taking a constant reference.
*/
constexpr auto pageSize = pagesize;
/* Try to lock the PTE to clean up the meta data.
* Only the last allocation within the PTE will successfully lock the PTE.
* In case it is the last allocation on the page, the newly stored pagesize signals a full page and nobody else
* is allowed to touch the meta data anymore.
*/
auto oldfilllevel
= alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].count, 0u, pageSize);

if(oldfilllevel == 0)
{
const uint32 chunksize
= alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u);
// if chunksize == 0 then another thread already cleaned the page
if(chunksize != 0)
{
// clean meta data bits on the PTE
_page[page].init();

/** Ensure that meta data changes performed without atomics are propagated to all
* other threads.
*/
threadfenceDevice(acc);
/* Remove chunk information.
* It is important that this call happens after the page init because scatter malloc
* updates the chunksize without announcing the action by increasing the page count
* beforehand.
*/
auto oldChunkSize = alpaka::atomicOp<alpaka::AtomicCas>(
acc,
(uint32*) &_ptes[page].chunksize,
chunksize,
0u);
if(oldChunkSize != chunksize)
{
// The chunksize can only be changed if it was zero in between. Therefore this code
// should never be reached, or we started this method with an outdated chunksize.
printf(
"%u != %u, %u unexpected behaviour during dealloction\n",
oldChunkSize,
chunksize,
page);
}
}
/* Unlock the PTE by reducing the counter.
* In case another allocation is trying to allocate memory in tryUsePage() at the same moment,
* the counter can be larger than zero after this dealloc reduces it. This is no
* problem because if the chunk size in tryUsePage() does not fit, the counter is reduced and
* the page is marked as free.
*/
alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, pageSize);
}
}
}
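
tryCleanPage above combines two idioms worth spelling out. The CAS from 0u to pagesize both locks the PTE and makes the page look full, so concurrent tryUsePage calls back off; and the unlock subtracts pagesize instead of storing zero, so counter increments from threads that raced in while the page was locked survive. A simplified host-side sketch, under the same assumptions as before (std::atomic in place of alpaka::atomicOp, a hypothetical Pte type, and the actual page meta data reset elided):

#include <atomic>
#include <cstdint>

constexpr std::uint32_t pageSize = 4096u; // stand-in for the pagesize template value

struct Pte // hypothetical stand-in for the allocator's page table entry
{
    std::atomic<std::uint32_t> count{0u};
    std::atomic<std::uint32_t> chunksize{0u};
};

void tryCleanPageSketch(Pte& pte)
{
    // Lock: only a thread observing count == 0 wins the CAS. The stored
    // pageSize makes the page appear full, so concurrent allocations back off.
    std::uint32_t expected = 0u;
    if(!pte.count.compare_exchange_strong(expected, pageSize))
        return; // chunks are still in use, or another cleaner won the race

    if(pte.chunksize.load() != 0u)
    {
        // ... reset the page's meta data (bit masks, fill counters) ...

        // Fence: make the non-atomic meta data writes above visible to all
        // threads before the chunk size is cleared (threadfenceDevice's role).
        std::atomic_thread_fence(std::memory_order_seq_cst);
        pte.chunksize.store(0u);
    }

    // Unlock: subtract pageSize rather than storing 0u, so increments from
    // threads that entered tryUsePage during the clean-up are not lost.
    pte.count.fetch_sub(pageSize);
}
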

/**
* deallocChunked frees the chunk on the page and updates all data
* accordingly
@@ -671,56 +751,10 @@
alpaka::atomicOp<alpaka::AtomicAnd>(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment));
}

uint32 oldfilllevel = 0u;
if constexpr(resetfreedpages)
{
/* Workaround for nvcc because the in-class defined static constexpr variable cannot be passed
* into functions taking a constant reference.
*/
constexpr auto pageSize = pagesize;
/* Try to lock the PTE to clean up the meta data.
* Only the last allocation within the PTE will successfully lock the PTE.
* In case it is the last allocation on the page, the newly stored pagesize signals a full page and nobody else
* is allowed to touch the meta data anymore.
*/
oldfilllevel
= alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].count, 1u, pageSize);
if(oldfilllevel == 1)
{
// clean meta data bits on the PTE
_page[page].init();

// remove chunk information
alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, chunksize, 0u);

/** Ensure that meta data changes performed without atomics are propagated to all
* other threads.
*
* @todo Moving this line above the chunk size reset results in misaligned memory accesses
* on CUDA in rare cases. It is not clear why :-(
*/
threadfenceDevice(acc);
uint32 oldfilllevel = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);

/* Unlock the PTE by reducing the counter.
* In case another allocation is trying to allocate memory in tryUsePage() at the same moment,
* the counter can be larger than zero after this dealloc reduces it. This is no
* problem because if the chunk size in tryUsePage() does not fit, the counter is reduced and
* the page is marked as free.
*/
alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, pageSize);
}
else
{
// Locking the page was not possible because there are still other allocations on the PTE.
oldfilllevel = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);
}
}
else
{
// If we do not reset freed pages we only need to reduce the counter; no need to clean the meta
// data.
oldfilllevel = alpaka::atomicOp<alpaka::AtomicSub>(acc, (uint32*) &_ptes[page].count, 1u);
}
if(oldfilllevel == 1u)
tryCleanPage(acc, page);

// meta information counters ... should not be changed by too
// many threads, so..
@@ -922,7 +956,11 @@
return;
// let's see which page we are on
const auto page = static_cast<uint32>(((char*) mem - (char*) _page) / pagesize);
const uint32 chunksize = _ptes[page].chunksize;
/* Emulate atomic read.
* In older implementations we read the chunksize without atomics, which can result in data races.
*/
const uint32 chunksize
= alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u);

// is the pointer the beginning of a chunk?
const auto inpage_offset = static_cast<uint32>((char*) mem - _page[page].data);
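
The CAS with identical compare and swap values (0u, 0u) above never modifies memory: if chunksize is zero it is replaced by zero, otherwise the operation fails, and in both cases the returned old value is an atomic snapshot. This is the usual way to obtain an atomic load from an interface that, like atomicOp here, only exposes read-modify-write operations. A host-side illustration with std::atomic, whose native load() serves as the reference; atomicReadViaCas is a hypothetical name:

#include <atomic>
#include <cassert>
#include <cstdint>

// Atomic read built from compare-and-swap alone: compare against 0 and also
// write 0, so the stored value never changes, but the returned old value is
// an atomic snapshot of the variable.
std::uint32_t atomicReadViaCas(std::atomic<std::uint32_t>& v)
{
    std::uint32_t expected = 0u;
    // On failure compare_exchange_strong writes the current value into
    // `expected`; on success the value was 0 anyway.
    v.compare_exchange_strong(expected, 0u);
    return expected;
}

int main()
{
    std::atomic<std::uint32_t> chunksize{128u};
    assert(atomicReadViaCas(chunksize) == chunksize.load());
}
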
@@ -1073,7 +1111,8 @@
const AlpakaAcc& m_acc,
T_DeviceAllocator* m_heap,
void* m_heapmem,
size_t m_memsize) {
size_t m_memsize)
{
m_heap->pool = m_heapmem;
m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize);
};
@@ -1115,8 +1154,8 @@
const uint32 filledChunks = _ptes[page].count;
if(chunksize <= HierarchyThreshold)
{
const uint32 segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32 2nd-level
// chunks
const uint32 segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32
// 2nd-level chunks
const uint32 fullsegments = alpaka::math::min(
acc,
32u,
@@ -1246,7 +1285,8 @@
const AlpakaAcc& acc,
T_DeviceAllocator* heapPtr,
size_t numBytes,
unsigned* slots) -> void {
unsigned* slots) -> void
{
const auto gid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();

const auto nWorker = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
