Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tjf hiearchical unpack #4

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ getTargetRowMapIndices(const LO lclNumRows,
TEUCHOS_ASSERT(gblRow < indexBase + gblNumRows);
tgtGids[lid] = gblRow;
}
return std::move(tgtGids);
// The original return using std::move (commented out below) triggers the
// following warning with gcc 9.2.0:
// warning: moving a local object in a return statement prevents copy elision [-Wpessimizing-move]
//return std::move(tgtGids);
return tgtGids;
}

RCP<const map_type>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,15 +252,15 @@ getTpetraCrsMatrix (Teuchos::FancyOStream& out,
using Teuchos::rcp;
using std::endl;
using matrix_type = Tpetra::CrsMatrix<>;
using device_type = matrix_type::device_type;
//using device_type = matrix_type::device_type;
using SC = matrix_type::impl_scalar_type;
using KAT = Kokkos::ArithTraits<SC>;
//using KAT = Kokkos::ArithTraits<SC>;
using LO = Tpetra::Map<>::local_ordinal_type;
using host_device_type = Kokkos::View<SC*, Kokkos::LayoutRight, device_type>::host_mirror_space;
using host_execution_space = host_device_type::execution_space;
//using host_device_type = Kokkos::View<SC*, Kokkos::LayoutRight, device_type>::host_mirror_space;
//using host_execution_space = host_device_type::execution_space;

// We're filling on the host, so generate random numbers on the host.
using pool_type = Kokkos::Random_XorShift64_Pool<host_execution_space>;
//using pool_type = Kokkos::Random_XorShift64_Pool<host_execution_space>;

Teuchos::OSTab tab0 (out);
out << "Create CrsMatrix for benchmark" << endl;
Expand Down
130 changes: 81 additions & 49 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4733,6 +4733,9 @@ namespace Tpetra {
os << *prefix << endl;
std::cerr << os.str ();
}
Details::ProfilingRegion region(
"Tpetra::CrsMatrix::fillCompete",
"fillCompete");

TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(! this->isFillActive () || this->isFillComplete (), std::runtime_error,
Expand All @@ -4743,54 +4746,57 @@ namespace Tpetra {
//
// Read parameters from the input ParameterList.
//

// If true, the caller promises that no process did nonlocal
// changes since the last call to fillComplete.
bool assertNoNonlocalInserts = false;
// If true, makeColMap sorts remote GIDs (within each remote
// process' group).
bool sortGhosts = true;

if (! params.is_null ()) {
assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
assertNoNonlocalInserts);
if (params->isParameter ("sort column map ghost gids")) {
sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
}
else if (params->isParameter ("Sort column Map ghost GIDs")) {
sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
}
}
// We also don't need to do global assembly if there is only one
// process in the communicator.
const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
// This parameter only matters if this matrix owns its graph.
if (! this->myGraph_.is_null ()) {
this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
}

if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
if (this->hasColMap ()) { // use local indices
allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
{
Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");

// If true, the caller promises that no process did nonlocal
// changes since the last call to fillComplete.
bool assertNoNonlocalInserts = false;
// If true, makeColMap sorts remote GIDs (within each remote
// process' group).
bool sortGhosts = true;

if (! params.is_null ()) {
assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
assertNoNonlocalInserts);
if (params->isParameter ("sort column map ghost gids")) {
sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
}
else if (params->isParameter ("Sort column Map ghost GIDs")) {
sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
}
}
// We also don't need to do global assembly if there is only one
// process in the communicator.
const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
// This parameter only matters if this matrix owns its graph.
if (! this->myGraph_.is_null ()) {
this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
}

if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
if (this->hasColMap ()) { // use local indices
allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
}
else { // no column Map, so use global indices
allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
}
}
// Global assemble, if we need to. This call only costs a single
// all-reduce if we didn't need global assembly after all.
if (needGlobalAssemble) {
this->globalAssemble ();
}
else { // no column Map, so use global indices
allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
else {
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(numProcs == 1 && nonlocals_.size() > 0,
std::runtime_error, "Cannot have nonlocal entries on a serial run. "
"An invalid entry (i.e., with row index not in the row Map) must have "
"been submitted to the CrsMatrix.");
}
}
// Global assemble, if we need to. This call only costs a single
// all-reduce if we didn't need global assembly after all.
if (needGlobalAssemble) {
this->globalAssemble ();
}
else {
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(numProcs == 1 && nonlocals_.size() > 0,
std::runtime_error, "Cannot have nonlocal entries on a serial run. "
"An invalid entry (i.e., with row index not in the row Map) must have "
"been submitted to the CrsMatrix.");
}

if (this->isStaticGraph ()) {
Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
// FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
// checks below only in debug mode. It would be nicer to do a
// local check, then propagate the error state in a deferred
Expand Down Expand Up @@ -4840,6 +4846,7 @@ namespace Tpetra {
this->fillLocalMatrix (params);
}
else {
Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
// Set the graph's domain and range Maps. This will clear the
// Import if the domain Map has changed (is a different
// pointer), and the Export if the range Map has changed (is a
Expand Down Expand Up @@ -4892,16 +4899,26 @@ namespace Tpetra {
this->myGraph_->checkInternalState ();
}

const bool callComputeGlobalConstants = params.get () == nullptr ||
params->get ("compute global constants", true);
if (callComputeGlobalConstants) {
this->computeGlobalConstants ();
{
Details::ProfilingRegion region_ccgc(
"Tpetra::CrsMatrix::fillCompete", "callComputeGlobalConstamnts"
);
const bool callComputeGlobalConstants = params.get () == nullptr ||
params->get ("compute global constants", true);
if (callComputeGlobalConstants) {
this->computeGlobalConstants ();
}
}

// FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.

this->fillComplete_ = true; // Now we're fill complete!
this->checkInternalState ();
{
Details::ProfilingRegion region_cis(
"Tpetra::CrsMatrix::fillCompete", "checkInternalState"
);
this->checkInternalState ();
}
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
Expand Down Expand Up @@ -7284,6 +7301,11 @@ namespace Tpetra {
typedef GlobalOrdinal GO;
typedef impl_scalar_type ST;

Details::ProfilingRegion region_upack_row(
"Tpetra::CrsMatrix::unpackRow",
"Import/Export"
);

if (numBytes == 0) {
// Rows with zero bytes should always have zero entries.
if (numEnt != 0) {
Expand Down Expand Up @@ -7475,6 +7497,7 @@ namespace Tpetra {
Distributor& dist) const
{
// The call to packNew in packAndPrepare catches and handles any exceptions.
Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
if (this->isStaticGraph ()) {
using ::Tpetra::Details::packCrsMatrixNew;
packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
Expand Down Expand Up @@ -7902,6 +7925,10 @@ namespace Tpetra {
const CombineMode combineMode,
const bool verbose)
{
Details::ProfilingRegion region_unpack_and_combine_impl(
"Tpetra::CrsMatrix::unpackAndCombineImpl",
"Import/Export"
);
using std::endl;
const char tfecfFuncName[] = "unpackAndCombineImpl";
std::unique_ptr<std::string> prefix;
Expand Down Expand Up @@ -8019,6 +8046,11 @@ namespace Tpetra {
return; // nothing to do; no need to combine entries
}

Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
"Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
"Import/Export"
);

// We're unpacking on host. This is read-only host access.
if (imports.need_sync_host()) {
imports.sync_host ();
Expand Down
79 changes: 71 additions & 8 deletions packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ namespace Details {
// Mutable, file-wide state consulted by the Behavior query functions.
namespace BehaviorDetails {
// Two-level, string-keyed table of boolean flags.
// NOTE(review): presumably a cache for the "named" environment-variable
// lookups (outer key = variable, inner key = name) -- confirm at its use site.
std::map<std::string, std::map<std::string, bool> > namedVariableMap_;
// Set to true by Behavior::disable_verbose_behavior().
bool verboseDisabled_ = false;
// While true, both Behavior::timing() overloads return false immediately.
// Toggled by Behavior::disable_timing() / Behavior::enable_timing().
bool timingDisabled_ = false;
}

namespace { // (anonymous)
Expand Down Expand Up @@ -188,12 +189,16 @@ namespace { // (anonymous)
else {
// This could throw invalid_argument or out_of_range.
// Go ahead and let it do so.
const long long val = std::stoll(stringToUpper(varVal));
TEUCHOS_TEST_FOR_EXCEPTION
(val < static_cast<long long>(0), std::out_of_range,
prefix << "Environment variable \""
<< environmentVariableName << "\" is supposed to be a size, "
"but it has a negative integer value " << val << ".");
long long val = std::stoll(stringToUpper(varVal));
if (val < static_cast<long long>(0)) {
// If negative - user has requested threshold be lifted
return std::numeric_limits<size_t>::max();
}
// TEUCHOS_TEST_FOR_EXCEPTION
// (val < static_cast<long long>(0), std::out_of_range,
// prefix << "Environment variable \""
// << environmentVariableName << "\" is supposed to be a size, "
// "but it has a negative integer value " << val << ".");
if (sizeof(long long) > sizeof(size_t)) {
// It's hard to test this code, but I want to try writing it
// at least, in case we ever have to run on 32-bit machines or
Expand Down Expand Up @@ -271,6 +276,10 @@ namespace { // (anonymous)
return false;
}

// Compile-time default used by Behavior::timing(): timing is off.
constexpr bool timingDefault () { return false; }

constexpr bool assumeMpiIsCudaAwareDefault () {
#ifdef TPETRA_ASSUME_CUDA_AWARE_MPI
return true;
Expand All @@ -279,6 +288,10 @@ namespace { // (anonymous)
#endif // TPETRA_ASSUME_CUDA_AWARE_MPI
}

// Compile-time default used by Behavior::hierarchicalUnpack(): enabled.
constexpr bool hierarchicalUnpackDefault () { return true; }

} // namespace (anonymous)

bool Behavior::debug ()
Expand Down Expand Up @@ -309,6 +322,21 @@ bool Behavior::verbose ()
defaultValue);
}

// Reports whether timing is enabled.
//
// Returns false immediately while BehaviorDetails::timingDisabled_ is set
// (see disable_timing()).  Otherwise the answer is whatever
// idempotentlyGetEnvironmentVariableAsBool yields for "TPETRA_TIMING",
// with timingDefault() passed as the default.
bool Behavior::timing ()
{
  if (BehaviorDetails::timingDisabled_) {
    return false;
  }

  constexpr bool defaultValue = timingDefault ();
  constexpr char envVarName[] = "TPETRA_TIMING";

  // Function-local statics handed to the helper by reference.
  // NOTE(review): presumably these let the helper read the environment
  // only once and reuse the result afterwards -- confirm in the helper.
  static bool initialized_ = false;
  static bool value_ = defaultValue;

  return idempotentlyGetEnvironmentVariableAsBool
    (value_, initialized_, envVarName, defaultValue);
}

bool Behavior::assumeMpiIsCudaAware ()
{
constexpr char envVarName[] = "TPETRA_ASSUME_CUDA_AWARE_MPI";
Expand Down Expand Up @@ -369,7 +397,7 @@ size_t Behavior::multivectorKernelLocationThreshold ()
(value_, initialized_, envVarName, defaultValue);
}

bool Behavior::profilingRegionUseTeuchosTimers ()
bool Behavior::profilingRegionUseTeuchosTimers ()
{
constexpr char envVarName[] = "TPETRA_USE_TEUCHOS_TIMERS";
constexpr bool defaultValue(false);
Expand All @@ -380,7 +408,7 @@ bool Behavior::profilingRegionUseTeuchosTimers ()
(value_, initialized_, envVarName, defaultValue);
}

bool Behavior::profilingRegionUseKokkosProfiling ()
bool Behavior::profilingRegionUseKokkosProfiling ()
{
constexpr char envVarName[] = "TPETRA_USE_KOKKOS_PROFILING";
constexpr bool defaultValue(false);
Expand Down Expand Up @@ -426,6 +454,41 @@ void Behavior::disable_verbose_behavior () {
BehaviorDetails::verboseDisabled_ = true;
}

// Reports whether timing is enabled for the given name.
//
// Returns false immediately while BehaviorDetails::timingDisabled_ is set
// (see disable_timing()).  Otherwise the decision is delegated to
// idempotentlyGetNamedEnvironmentVariableAsBool, which receives `name`,
// the variable "TPETRA_TIMING", and a default of false.
bool Behavior::timing (const char name[])
{
  if (BehaviorDetails::timingDisabled_) {
    return false;
  }

  constexpr bool defaultValue = false;
  constexpr char envVarName[] = "TPETRA_TIMING";

  // Function-local flag handed to the helper by reference.
  // NOTE(review): presumably marks that the variable was already parsed --
  // confirm in the helper.
  static bool initialized_ = false;

  return idempotentlyGetNamedEnvironmentVariableAsBool
    (name, initialized_, envVarName, defaultValue);
}

// Clears the global "timing disabled" override, so that the
// Behavior::timing() overloads no longer return false unconditionally.
void Behavior::enable_timing()
{
  BehaviorDetails::timingDisabled_ = false;
}

// Sets the global "timing disabled" override; while it is set, the
// Behavior::timing() overloads return false unconditionally.
void Behavior::disable_timing()
{
  BehaviorDetails::timingDisabled_ = true;
}

// Reports whether hierarchical unpack is enabled.
//
// The answer is whatever idempotentlyGetEnvironmentVariableAsBool yields
// for "TPETRA_HIERARCHICAL_UNPACK", with hierarchicalUnpackDefault()
// passed as the default.
bool Behavior::hierarchicalUnpack ()
{
  constexpr bool defaultValue = hierarchicalUnpackDefault();
  constexpr char envVarName[] = "TPETRA_HIERARCHICAL_UNPACK";

  // Function-local statics handed to the helper by reference.
  // NOTE(review): presumably these let the helper read the environment
  // only once and reuse the result afterwards -- confirm in the helper.
  static bool initialized_ = false;
  static bool value_ = defaultValue;

  return idempotentlyGetEnvironmentVariableAsBool
    (value_, initialized_, envVarName, defaultValue);
}

} // namespace Details
} // namespace Tpetra

Loading