Skip to content

Commit

Permalink
Clear the FieldCache and any comms buffers when changing communicator…
Browse files Browse the repository at this point in the history
…s; this fixes UB that caused non-reproducible hangs when testing split grid. Also adds comm_barrier_global(), a global barrier regardless of the present communicator scope
  • Loading branch information
maddyscientist committed Dec 14, 2024
1 parent f40868b commit 1455aae
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
11 changes: 11 additions & 0 deletions include/comm_quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,18 @@ namespace quda
*/
void comm_broadcast(void *data, size_t nbytes, int root = 0);

/**
@brief Multi-process barrier that applies to the present
communicator
*/
void comm_barrier(void);

/**
@brief Multi-process barrier that is global regardless of the
present communicator
*/
void comm_barrier_global(void);

void comm_abort(int status);
void comm_abort_(int status);

Expand Down
9 changes: 7 additions & 2 deletions lib/communicator_stack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,17 @@ namespace quda
// used to store the size of the tunecache at the point of splitting
static size_t tune_cache_size = 0;

// destroy any message handles associated with the prior communicator
LatticeField::freeGhostBuffer();
ColorSpinorField::freeGhostBuffer();
FieldTmp<ColorSpinorField>::destroy();

auto search = communicator_stack.find(split_key);
if (search == communicator_stack.end()) {
communicator_stack.emplace(std::piecewise_construct, std::forward_as_tuple(split_key),
std::forward_as_tuple(get_default_communicator(), split_key.data()));
}

LatticeField::freeGhostBuffer(); // Destroy the (IPC) Comm buffers with the old communicator.

auto split_key_old = current_key;
current_key = split_key;

Expand Down Expand Up @@ -362,6 +365,8 @@ namespace quda

// Barrier on the communicator that is currently in scope: forwards to
// the comm_barrier() member of get_current_communicator().
void comm_barrier(void)
{
  auto &&active_comm = get_current_communicator();
  active_comm.comm_barrier();
}

// Barrier on the default communicator, regardless of which communicator
// is presently in scope: forwards to the comm_barrier() member of
// get_default_communicator().
void comm_barrier_global(void)
{
  auto &&default_comm = get_default_communicator();
  default_comm.comm_barrier();
}

// Forward the abort request, with the caller-supplied status code, to
// Communicator::comm_abort_.
void comm_abort_(int status) { Communicator::comm_abort_(status); }

// Return the commDim() value reported by the current communicator for
// the requested dimension index.
int commDim(int dim)
{
  auto &&active_comm = get_current_communicator();
  return active_comm.commDim(dim);
}
Expand Down

0 comments on commit 1455aae

Please sign in to comment.