diff --git a/libdist/include/galois/DTerminationDetector.h b/libdist/include/galois/DTerminationDetector.h index f337978e60..200388f509 100644 --- a/libdist/include/galois/DTerminationDetector.h +++ b/libdist/include/galois/DTerminationDetector.h @@ -69,7 +69,7 @@ class DGTerminator { void reinitialize() { prev_snapshot = 0; - snapshot = 1; + snapshot = 1; global_snapshot = 1; work_done = false; } @@ -184,6 +184,11 @@ class DGTerminator { initiate_snapshot(); } else { galois::gDebug("[", net.ID, "] terminating ", snapshot); + // an explicit barrier may be required here + // so that the next async phase begins on all hosts at the same time + // however, this may add overheads when it is not required + // (depending on when the next async phase actually begins), so + // ASSUME: caller will call getHostBarrier().wait() if required reinitialize(); // for next async phase return true; } diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index a2d34497af..e34c1db611 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -284,8 +284,9 @@ void DistStatManager::combineAtHost_0(void) { receiveAtHost_0_helper(); } }; - - galois::DGTerminator td2; + // explicit barrier after logical barrier is required + // as next async phase begins immediately + getHostBarrier().wait(); // host 0 reads stats from Base class // other hosts send stats to host 0 @@ -293,12 +294,15 @@ void DistStatManager::combineAtHost_0(void) { getSystemNetworkInterface().flush(); // barrier - while (td2.reduce()) { + while (td.reduce()) { if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } }; + // explicit barrier after logical barrier is required + // as next async phase begins immediately + getHostBarrier().wait(); } bool DistStatManager::printingHostVals(void) { diff --git a/libgalois/src/PerThreadStorage.cpp b/libgalois/src/PerThreadStorage.cpp index 0f9abd14c6..80c98e7626 100644 --- a/libgalois/src/PerThreadStorage.cpp +++ 
b/libgalois/src/PerThreadStorage.cpp @@ -18,8 +18,8 @@ */ #include "galois/substrate/PerThreadStorage.h" +#include "galois/substrate/PageAlloc.h" -//#include "galois/runtime/Mem.h" #include "galois/gIO.h" #include @@ -37,17 +37,15 @@ galois::substrate::PerBackend& galois::substrate::getPPSBackend() { return b; } -#define MORE_MEM_HACK -#ifdef MORE_MEM_HACK -const size_t allocSize = - 16 * (2 << 20); // galois::runtime::MM::hugePageSize * 16; -inline void* alloc() { return malloc(allocSize); } - -#else -const size_t allocSize = galois::runtime::MM::hugePageSize; -inline void* alloc() { return galois::substrate::MM::pageAlloc(); } -#endif -#undef MORE_MEM_HACK +const size_t ptAllocSize = galois::substrate::allocSize(); +inline void* alloc() { + // alloc a single page (NOTE(review): this comment originally said "don't prefault", but `true` is passed to allocPages -- confirm the flag's meaning) + void* toReturn = galois::substrate::allocPages(1, true); + if (toReturn == nullptr) { + GALOIS_DIE("Out of memory in per thread storage allocation"); + } + return toReturn; +} unsigned galois::substrate::PerBackend::nextLog2(unsigned size) { unsigned i = MIN_SIZE; @@ -61,11 +59,11 @@ unsigned galois::substrate::PerBackend::nextLog2(unsigned size) { } unsigned galois::substrate::PerBackend::allocOffset(const unsigned sz) { - unsigned retval = allocSize; + unsigned retval = ptAllocSize; unsigned ll = nextLog2(sz); unsigned size = (1 << ll); - if ((nextLoc + size) <= allocSize) { + if ((nextLoc + size) <= ptAllocSize) { // simple path, where we allocate bump ptr style retval = __sync_fetch_and_add(&nextLoc, size); } else if (!invalid) { @@ -101,7 +99,7 @@ unsigned galois::substrate::PerBackend::allocOffset(const unsigned sz) { } } - assert(retval != allocSize); + assert(retval != ptAllocSize); return retval; } @@ -137,7 +135,7 @@ void galois::substrate::PerBackend::initCommon(unsigned maxT) { char* galois::substrate::PerBackend::initPerThread(unsigned maxT) { initCommon(maxT); char* b = heads[ThreadPool::getTID()] = (char*)alloc(); - memset(b, 0, allocSize); + memset(b, 0, ptAllocSize); 
return b; } @@ -147,7 +145,7 @@ char* galois::substrate::PerBackend::initPerSocket(unsigned maxT) { unsigned leader = ThreadPool::getLeader(); if (id == leader) { char* b = heads[id] = (char*)alloc(); - memset(b, 0, allocSize); + memset(b, 0, ptAllocSize); return b; } else { // wait for leader to fix up socket diff --git a/lonestardist/CMakeLists.txt b/lonestardist/CMakeLists.txt index 421c114b22..8a0ebabe75 100644 --- a/lonestardist/CMakeLists.txt +++ b/lonestardist/CMakeLists.txt @@ -88,7 +88,7 @@ function(add_test_dist_and_verify app input type part N np) set(suffix "-${app}-${type}-${input}-${part}-${np}") if (EXISTS ${output}) add_test(run${suffix} mpiexec --bind-to none -n ${np} ./${app} ${X_UNPARSED_ARGUMENTS} -t=${t} -partition=${part} -verify) - add_test(verify${suffix} python ${RESULT_CHECKER} -t=0.01 -sort=1 -delete=1 ${OUTPUT} output_${HOSTNAME}_*.log) + add_test(verify${suffix} python ${RESULT_CHECKER} -t=0.01 -sort=1 -delete=1 ${output} output_${HOSTNAME}_*.log) else() add_test(run-${app}-${type}-${input}-${part}-${np} mpiexec --bind-to none -n ${np} ./${app} ${X_UNPARSED_ARGUMENTS} -t=${t} -partition=${part}) endif() @@ -98,9 +98,6 @@ function(add_test_dist_and_verify app input type part N np) endif() - # Don't run dist tests in CI since they are broken in too many cases right now. 
- return() - if (NOT ${X_NOT_QUICK}) set_tests_properties(run${suffix} PROPERTIES ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1 LABELS quick) @@ -108,6 +105,16 @@ function(add_test_dist_and_verify app input type part N np) endfunction() function(add_test_dist_for_partitions app input type num_threads num_gpus part) + # cut threads in system in half first + if (${num_threads} GREATER 1) + math(EXPR num_threads "${num_threads} / 2") + endif() + + # spawn at most 8 processes/use at most 8 threads during testing + if (${num_threads} GREATER 8) + set(num_threads 8) + endif() + set(partitions ${num_threads}) set(thr ${num_threads}) while (${thr} GREATER 1) diff --git a/lonestardist/bc/CMakeLists.txt b/lonestardist/bc/CMakeLists.txt index c152f1645f..4912a67e6b 100644 --- a/lonestardist/bc/CMakeLists.txt +++ b/lonestardist/bc/CMakeLists.txt @@ -1,6 +1,6 @@ app_dist(bc_level) -add_test_dist(bc_level rmat15 NO_ASYNC ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -singleSource) +add_test_dist(bc_level rmat15 NO_ASYNC ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numOfSources=4) app_dist(bc_mr NO_GPU) -add_test_dist(bc_mr rmat15 NO_ASYNC NO_GPU ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numOfSources=128 -numRoundSources=32) +add_test_dist(bc_mr rmat15 NO_ASYNC NO_GPU ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numOfSources=4 -numRoundSources=4) #add_test_dist(bc_mr rmat15all NO_ASYNC NO_GPU NOT_QUICK ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numRoundSources=4096)