From ca09ba99c6113e3a65ba205a3b70e0a8499fb17d Mon Sep 17 00:00:00 2001 From: srkenno Date: Tue, 17 Dec 2024 11:31:31 -0700 Subject: [PATCH 1/7] prepare for clean PR Signed-off-by: srkenno --- .../tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 20 + .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 323 +++++++++- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 594 ++++++++++++++++-- .../src/Tpetra_Details_WrappedDualView.hpp | 10 +- .../core/src/Tpetra_Details_crsUtils.hpp | 72 ++- packages/tpetra/core/src/Tpetra_Map_decl.hpp | 1 + packages/tpetra/core/src/Tpetra_Map_def.hpp | 40 +- 7 files changed, 975 insertions(+), 85 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 85f91bd676c2..01c1f3ac6d2b 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1194,6 +1194,26 @@ namespace Tpetra { buffer_device_type>& permuteFromLIDs, const CombineMode CM) override; + + void + insertGlobalIndicesDevice + (const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd); + + void + copyAndPermuteNew + (const row_graph_type& source, + row_graph_type& target, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM); + using padding_type = Details::CrsPadding< local_ordinal_type, global_ordinal_type>; diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 17fbcbfb9a5d..6fcf2700c066 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -964,6 +964,8 @@ namespace Tpetra { CrsGraph:: getLocalNumEntries () const { + Details::ProfilingRegion regionGLNE("Tpetra::CrsGraph::getLocalNumEntries"); + const char tfecfFuncName[] = "getLocalNumEntries: "; typedef LocalOrdinal LO; @@ -1185,7 +1187,6 @@ namespace Tpetra { CrsGraph:: allocateIndices (const ELocalGlobal lg, const bool verbose) { - using Details::ProfilingRegion; using Teuchos::arcp; using Teuchos::Array; using Teuchos::ArrayRCP; @@ -1196,7 +1197,7 @@ namespace Tpetra { const char tfecfFuncName[] = "allocateIndices: "; const char suffix[] = " Please report this bug to the Tpetra developers."; - ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); + Details::ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); std::unique_ptr prefix; if (verbose) { @@ -1593,6 +1594,8 @@ namespace Tpetra { typedef GlobalOrdinal GO; const char tfecfFuncName[] = "insertIndices: "; + Details::ProfilingRegion regionII("Tpetra::CrsGraph::insertIndices"); + size_t oldNumEnt = 0; if (debug_) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC @@ -1714,12 +1717,15 @@ namespace Tpetra { const char tfecfFuncName[] = "insertGlobalIndicesImpl: "; const LO lclRow = static_cast (rowInfo.localRow); + Details::ProfilingRegion regionIGII("Tpetra::CrsGraph::insertGlobalIndicesImpl"); + auto numEntries = rowInfo.numEntries; using inp_view_type = View; inp_view_type inputInds(inputGblColInds, numInputInds); size_t numInserted; { auto gblIndsHostView = this->gblInds_wdv.getHostView(Access::ReadWrite); + // FIXME - device numInserted = Details::insertCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), gblIndsHostView, numEntries, inputInds, fun); @@ -1776,6 +1782,8 @@ namespace Tpetra { using LO = LocalOrdinal; const char tfecfFuncName[] = "insertLocallIndicesImpl: "; + Details::ProfilingRegion regionILII("Tpetra::CrsGraph::insertLocallIndicesImpl"); + const RowInfo rowInfo = this->getRowInfo(myRow); size_t numNewInds = 0; @@ -1837,6 +1845,8 @@ namespace Tpetra { using Kokkos::MemoryUnmanaged; auto invalidCount = Teuchos::OrdinalTraits::invalid(); + Details::ProfilingRegion regionFGI("Tpetra::CrsGraph::findGlobalIndices"); + using inp_view_type = View; inp_view_type inputInds(indices.getRawPtr(), indices.size()); @@ -1847,10 +1857,18 @@ namespace Tpetra { if (this->colMap_.is_null()) return invalidCount; const auto& colMap = *(this->colMap_); + auto map = [&](GO const gblInd){return colMap.getLocalElement(gblInd);}; - numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), - rowInfo.numEntries, - lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + + if (this->isSorted()) { + numFound = Details::findCrsIndicesSorted(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } else { + numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } } else if (this->isGloballyIndexed()) { @@ -1861,7 +1879,6 @@ namespace Tpetra { return numFound; } - template size_t CrsGraph:: @@ -2313,6 +2330,8 @@ namespace Tpetra { using Teuchos::ArrayView; const char tfecfFuncName[] = "getGlobalRowCopy: "; + Details::ProfilingRegion regionGGRC("Tpetra::CrsGraph::getGlobalRowCopy"); + // This does the right thing (reports an empty row) if the input // row is invalid. const RowInfo rowinfo = getRowInfoFromGlobalRowIndex (globalRow); @@ -2324,17 +2343,15 @@ namespace Tpetra { numEntries = theNumEntries; // first side effect if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { + if (isLocallyIndexed ()) { auto lclInds = getLocalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = colMap_->getGlobalElement (lclInds(j)); - } + bool err = colMap_->getGlobalElements(lclInds.data(), theNumEntries, indices.data()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); } else if (isGloballyIndexed ()) { auto gblInds = getGlobalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = gblInds(j); - } + std::memcpy((void*)indices.data(), (const void*) gblInds.data(), theNumEntries*sizeof(*indices.data())); } } } @@ -2912,6 +2929,8 @@ namespace Tpetra { using size_type = typename Teuchos::Array::size_type; const char tfecfFuncName[] = "globalAssemble: "; // for exception macro + Details::ProfilingRegion regionGA("Tpetra::CrsGraph::globalAssemble"); + std::unique_ptr prefix; if (verbose_) { prefix = this->createPrefix("CrsGraph", "globalAssemble"); @@ -3163,6 +3182,8 @@ namespace Tpetra { const char tfecfFuncName[] = "fillComplete: "; const bool verbose = verbose_; + Details::ProfilingRegion regionFC("Tpetra::CrsGraph::fillComplete"); + std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "fillComplete"); @@ -3531,6 +3552,8 @@ namespace Tpetra { "expertStaticFillComplete): "; const size_t lclNumRows = this->getLocalNumRows (); + Details::ProfilingRegion regionFLG("Tpetra::CrsGraph::fillLocalGraph"); + // This method's goal is to fill in the two arrays (compressed // sparse row format) that define the sparse graph's structure. @@ -4805,6 +4828,14 @@ namespace Tpetra { const char tfecfFuncName[] = "copyAndPermute: "; const bool verbose = verbose_; + if (true) { + const row_graph_type& srcRowGraph = + dynamic_cast (source); + copyAndPermuteNew(srcRowGraph, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, INSERT); + return; + } + + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "copyAndPermute"); @@ -4831,6 +4862,7 @@ namespace Tpetra { } auto padding = computeCrsPadding(srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose); + applyCrsPadding(*padding, verbose); // If the source object is actually a CrsGraph, we can use view @@ -5137,6 +5169,7 @@ namespace Tpetra { std::vector tgtGblColIndsScratch; execute_sync_host_uvm_access(); // protect host UVM access + // FIXME parallel_for for (LO lclRowInd = 0; lclRowInd < numSameIDs; ++lclRowInd) { const GO srcGblRowInd = srcRowMap.getGlobalElement(lclRowInd); const GO tgtGblRowInd = tgtRowMap.getGlobalElement(lclRowInd); @@ -7576,6 +7609,272 @@ namespace Tpetra { return output; } + template + void + CrsGraph:: + insertGlobalIndicesDevice(const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd) + { + using crs_graph_type = CrsGraph; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; + typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); + + const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); + const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); + + using local_map_type = typename crs_graph_type::map_type::local_map_type; + local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); + local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); + local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); + + auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); + auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); + auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); + auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); + + typename crs_graph_type::num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; + + auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); + + const bool sorted = false; + + bool hasMap = permuteFromLIDs.extent(0) > 0; + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + +#ifdef CRSGRAPH_INNER_ABORT +#undef CRSGRAPH_INNER_ABORT +#endif + +#define CRSGRAPH_INNER_ABORT(lin) do { \ + printf("ERROR: Tpetra_CrsGraph_def.hpp:%d", lin); \ + Kokkos::abort("error"); \ + } while(0) + + Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, loopEnd), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto srcLid = sourceLID; + auto tgtLid = sourceLID; + if (hasMap) { + srcLid = permuteFromLIDs_d(srcLid); + tgtLid = permuteToLIDs_d(tgtLid); + } + auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); + if (srcGid == GINV) CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); + + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); + if (tgtLocalRow == LINV) CRSGRAPH_INNER_ABORT(__LINE__); + if (tgtLocalRow != tgtLid) CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + auto start = srcLocalRowPtrsDevice(srcLid); + auto end = srcLocalRowPtrsDevice(srcLid + 1); + auto rowLength = (end - start); + + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); + auto tend = tstart + tgtNumEntries; + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + size_t num_inserted = 0; + + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); + + size_t hint=0; + for (size_t j = 0; j < rowLength; j++) { + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) CRSGRAPH_INNER_ABORT(__LINE__); + auto numInTgtRow = (tend - tstart); + + const size_t offset = + KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, + numInTgtRow, + gi, hint, sorted); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + Kokkos::abort("num_avail"); + } + tgtGlobalColIndsPtr[tstart + offset] = gi; + ++tend; + hint = offset + 1; + ++num_inserted; + } + } + k_numRowEnt(tgtLocalRow) += num_inserted; + + return size_t(0); + }); + + Kokkos::fence("here 10"); + Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); + tgtCrsGraph.setLocallyModified(); + } + + + template + void + CrsGraph:: + copyAndPermuteNew(const row_graph_type& srcRowGraph, + row_graph_type& tgtRowGraph, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM) + { + using std::endl; + using LO = local_ordinal_type; + using GO = global_ordinal_type; + const char tfecfFuncName[] = "copyAndPermuteNew: "; + const bool verbose = verbose_; + + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteNew"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("CrsGraph", "copyAndPermuteNew"); + std::ostringstream os; + os << *prefix << endl; + std::cerr << os.str (); + } + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), + std::runtime_error, "permuteToLIDs.extent(0) = " + << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " + << permuteFromLIDs.extent (0) << "."); + + if (verbose) { + std::ostringstream os; + os << *prefix << "Compute padding" << endl; + std::cerr << os.str (); + } + + using crs_graph_type = CrsGraph; + const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error srcGraph type= " << typeid(srcRowGraph).name()); + const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; + + crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error tgtGraph type= " << typeid(tgtRowGraph).name()); + + crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; + + auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, + permuteToLIDs, permuteFromLIDs, verbose); + + tgtCrsGraph.applyCrsPadding(*padding, verbose); + + const map_type& srcRowMap = *(srcRowGraph.getRowMap()); + const map_type& tgtRowMap = *(tgtRowGraph.getRowMap()); + const bool src_filled = srcRowGraph.isFillComplete(); + nonconst_global_inds_host_view_type row_copy; + LO myid = 0; + + // + // "Copy" part of "copy and permute." + // + LO numSameIDs_as_LID = static_cast(numSameIDs); + + if (src_filled || srcCrsGraphPtr == nullptr) { + if (verbose) { + std::ostringstream os; + os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; + std::cerr << os.str (); + } + // If the source graph is fill complete, we can't use view mode, + // because the data might be stored in a different format not + // compatible with the expectations of view mode. Also, if the + // source graph is not a CrsGraph, we can't use view mode, + // because RowGraph only provides copy mode access to the data. +#if 0 + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); + } +#else + Kokkos::DualView noPermute; + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + noPermute, noPermute, + numSameIDs_as_LID); +#endif + } else { + if (verbose) { + std::ostringstream os; + os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; + std::cerr << os.str (); + } + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (gid, row); + tgtCrsGraph.insertGlobalIndices (gid, row.extent(0), row.data()); + } + } + + // + // "Permute" part of "copy and permute." + // + auto permuteToLIDs_h = permuteToLIDs.view_host (); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + + if (src_filled || srcCrsGraphPtr == nullptr) { +#if 0 + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); + } +#else + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + permuteToLIDs, permuteFromLIDs, // note reversed arg order, tgt, then src + static_cast (permuteToLIDs_h.extent (0))); +#endif + } else { + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (srcgid, row); + tgtCrsGraph.insertGlobalIndices (mygid, row.extent(0), row.data()); + } + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index f0eef6b3b32e..27883ad5f545 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -49,6 +49,7 @@ #include "KokkosSparse_spmv.hpp" #include +#include #include #include #include @@ -2423,15 +2424,105 @@ namespace Tpetra { const impl_scalar_type newVals[], const LocalOrdinal numElts) { - Teuchos::ArrayView indsT(inds, numElts); - auto fun = - [&](size_t const k, size_t const /*start*/, size_t const offset) { - rowVals[offset] = newVals[k]; - }; - std::function cb(std::ref(fun)); - return graph.findGlobalIndices(rowInfo, indsT, cb); + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + [[maybe_unused]] LocalOrdinal niv=0; + + { // new + typedef LocalOrdinal LO; + typedef GlobalOrdinal GO; + + const bool sorted = graph.isSorted (); + const bool atomic = useAtomicUpdatesByDefault; // FIXME + size_t hint = 0; // guess at the index's relative offset in the row + LO numValid = 0; // number of valid input column indices + + if (graph.isLocallyIndexed ()) { + // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its + // pointer does NOT change its reference count. Thus, this + // code is still thread safe. + if (graph.colMap_.is_null ()) { + // NO input column indices are valid in this case, since if + // the column Map is null on the calling process, then the + // calling process owns no graph entries. + return numValid; + } + const map_type& colMap = * (graph.colMap_); + + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getLocalIndsViewHost (rowInfo); + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset]= newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + } + else if (graph.isGloballyIndexed ()) { + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getGlobalIndsViewHost (rowInfo); + + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset] = newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + // If the graph is neither locally nor globally indexed on the + // calling process, that means the calling process has no graph + // entries. Thus, none of the input column indices are valid. + return numValid; + } + return LINV; } + template LocalOrdinal CrsMatrix:: @@ -2466,8 +2557,8 @@ namespace Tpetra { return Teuchos::OrdinalTraits::invalid (); } const crs_graph_type& graph = * (this->staticGraph_); - const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow); + if (rowInfo.localRow == Teuchos::OrdinalTraits::invalid ()) { // The input local row is invalid on the calling process, // which means that the calling process summed 0 entries. @@ -2475,6 +2566,7 @@ namespace Tpetra { } auto curRowVals = this->getValuesViewHostNonConst (rowInfo); + const IST* const inVals = reinterpret_cast (inputVals); return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo, inputGblColInds, inVals, numEnt); @@ -3235,11 +3327,10 @@ CrsMatrix:: const map_type& colMap = * (staticGraph_->colMap_); auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo); auto curVals = getValuesViewHost(rowinfo); - - for (size_t j = 0; j < theNumEntries; ++j) { - values[j] = curVals[j]; - indices[j] = colMap.getGlobalElement (curLclInds(j)); - } + bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, indices.data()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); + // FIXME - this should/could be a kokkos deep copy? + std::memcpy((void*)values.data(), (const void*) curVals.data(), numEntries*sizeof(*values.data())); } else if (staticGraph_->isGloballyIndexed ()) { auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo); @@ -3469,7 +3560,7 @@ CrsMatrix:: setAllValues ( const local_matrix_device_type& localDeviceMatrix) { using ProfilingRegion=Details::ProfilingRegion; - ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix"); + ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues1 from KokkosSparse::CrsMatrix"); auto graph = localDeviceMatrix.graph; //FIXME how to check whether graph is allocated @@ -3495,7 +3586,7 @@ CrsMatrix:: typedef impl_scalar_type IST; typedef typename local_graph_device_type::row_map_type row_map_type; //typedef typename row_map_type::non_const_value_type row_offset_type; - const char tfecfFuncName[] = "setAllValues(ArrayRCP, ArrayRCP, ArrayRCP): "; + const char tfecfFuncName[] = "setAllValues2(ArrayRCP, ArrayRCP, ArrayRCP): "; // The row offset type may depend on the execution space. It may // not necessarily be size_t. If it's not, we need to make a deep @@ -5654,6 +5745,16 @@ CrsMatrix:: myGraph_->setRowPtrsUnpacked(row_ptr_beg); } + template + void + copyAndPermuteStaticGraphNew( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes); + template void CrsMatrix:: @@ -5688,41 +5789,137 @@ CrsMatrix:: verbose ? prefix.get()->c_str() : nullptr; const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); + const bool targetIsLocallyIndexed = this->isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // - const map_type& srcRowMap = * (srcMat.getRowMap ()); + const auto& srcRowMap = * (srcMat.getRowMap ()); nonconst_global_inds_host_view_type rowInds; nonconst_values_host_view_type rowVals; const LO numSameIDs_as_LID = static_cast (numSameIDs); - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - ArrayViewrowIndsConstView; - ArrayView rowValsConstView; + // FIXME - need to examine this path + if (0 && targetIsLocallyIndexed && sourceIsLocallyIndexed) { + + // Create a mapping from the source's local column id's to my local column ids + using DT = typename Node::device_type; + const map_type& src_col_map = *(srcMat.getColMap()); + const map_type& tgt_col_map = *(this->getColMap()); + + auto local_src_col_map = src_col_map.getLocalMap(); + auto local_tgt_col_map = tgt_col_map.getLocalMap(); + + auto invalid = Teuchos::OrdinalTraits::invalid(); + LO num_src_cols = static_cast(src_col_map.getLocalNumElements()); + Kokkos::UnorderedMap lid_map(num_src_cols); + for (LO src_local_col_idx=0; src_local_col_idxgetLocalRowView(local_row, tgt_local_cols, tgt_local_vals); + + Kokkos::View indices("tgt_local_cols", src_local_cols.extent(0)); + Kokkos::View values("tgt_local_vals", src_local_cols.extent(0)); + size_t idx = 0; + for (size_t offset=0; offset indices_const(indices.data(), indices.size()); + const impl_scalar_type* const values_const_data = reinterpret_cast(values.data()); + Kokkos::View values_const(values_const_data, values.size()); + auto inds = Kokkos::subview(indices_const, Kokkos::make_pair(size_t(0), idx)); + auto vals = Kokkos::subview(values_const, Kokkos::make_pair(size_t(0), idx)); + this->replaceLocalValues(local_row, inds, vals); + } + } else if (sourceIsLocallyIndexed) { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + ArrayView rowIndsConstView; + ArrayView rowValsConstView; - if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); + if (rowLength > static_cast (rowInds.size())) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + + nonconst_global_inds_host_view_type rowIndsView; + nonconst_values_host_view_type rowValsView; // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because // they have to be converted from LIDs). size_t checkRowLength = 0; - srcMat.getGlobalRowCopy (sourceGID, rowIndsView, - rowValsView, checkRowLength); + + { + using crs_matrix_type = CrsMatrix; + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + auto globalRow = sourceGID; + auto StaticGraphRCP = srcMatCrs.getGraph(); + const crs_graph_type *StaticGraphPtr = dynamic_cast(StaticGraphRCP.get()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(StaticGraphPtr == nullptr, std::runtime_error, "bad StaticGraphPtr"); + const crs_graph_type& StaticGraph = *StaticGraphPtr; + const RowInfo rowinfo = StaticGraph.getRowInfoFromGlobalRowIndex (globalRow); + const size_t theNumEntries = rowinfo.numEntries; + checkRowLength = theNumEntries; // first side effect + auto numEntries = theNumEntries; + + if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { + if (StaticGraph.isLocallyIndexed ()) { + const map_type& colMap = * (StaticGraph.getColMap()); + auto curLclInds = StaticGraph.getLocalIndsViewHost(rowinfo); + auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); + rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + auto rowIndsViewLocal = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); + bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, rowIndsViewLocal.data()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); + } + else if (StaticGraph.isGloballyIndexed ()) { + auto rowIndsViewLocal = StaticGraph.getGlobalIndsViewHost(rowinfo); + rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); + auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); + rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + } + } + } + if (debug) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC (rowLength != checkRowLength, std::logic_error, "For " @@ -5732,20 +5929,21 @@ CrsMatrix:: "a row length of " << checkRowLength << "." << suffix); } - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - } - else { // source matrix is globally indexed. + combineGlobalValues(targetGID, rowIndsConstView, + rowValsConstView, REPLACE, + prefix_raw, debug, verbose); + } // for (sourceLID... + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + ArrayViewrowIndsConstView; + ArrayView rowValsConstView; + global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); @@ -5762,13 +5960,12 @@ CrsMatrix:: // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + combineGlobalValues(targetGID, rowIndsConstView, + rowValsConstView, REPLACE, + prefix_raw, debug, verbose); } - - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); } if (verbose) { @@ -5776,6 +5973,10 @@ CrsMatrix:: os << *prefix << "Do permutes" << endl; } + // + // "Permute" part of "copy and permute." + // + const map_type& tgtRowMap = * (this->getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); @@ -6092,15 +6293,31 @@ CrsMatrix:: using RMT = RowMatrix; const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { - TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); - auto permuteToLIDs_h = permuteToLIDs.view_host (); - TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + if (1) + { + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + + copyAndPermuteStaticGraphNew(srcMat, *this, + numSameIDs, + permuteToLIDs_d.data(), + permuteFromLIDs_d.data(), + numPermute); + } + else { + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); + auto permuteToLIDs_h = permuteToLIDs.view_host (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + + copyAndPermuteStaticGraph(srcMat, numSameIDs, + permuteToLIDs_h.data(), + permuteFromLIDs_h.data(), + numPermute); - copyAndPermuteStaticGraph(srcMat, numSameIDs, - permuteToLIDs_h.data(), - permuteFromLIDs_h.data(), - numPermute); + } } else { copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs, @@ -6819,17 +7036,17 @@ CrsMatrix:: const bool verbose) { const char tfecfFuncName[] = "combineGlobalValues: "; - - if (isStaticGraph ()) { + const bool isg = isStaticGraph (); + if (isg) { // INSERT doesn't make sense for a static graph, since you // aren't allowed to change the structure of the graph. // However, all the other combine modes work. - if (combineMode == ADD) { - sumIntoGlobalValues (globalRowIndex, columnIndices, values); - } - else if (combineMode == REPLACE) { + if (combineMode == REPLACE) { replaceGlobalValues (globalRowIndex, columnIndices, values); } + else if (combineMode == ADD) { + sumIntoGlobalValues (globalRowIndex, columnIndices, values); + } else if (combineMode == ABSMAX) { using ::Tpetra::Details::AbsMax; AbsMax f; @@ -7053,6 +7270,7 @@ CrsMatrix:: } if (isStaticGraph ()) { + using Details::unpackCrsMatrixAndCombineNew; unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID, importLIDs, constantNumPackets, @@ -8842,6 +9060,7 @@ CrsMatrix:: #ifdef HAVE_TPETRA_MMM_TIMINGS Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries"))); #endif + Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d); @@ -9193,6 +9412,255 @@ CrsMatrix:: transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params); } + template + void + copyAndPermuteStaticGraphNew(const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes) + { + using Details::ProfilingRegion; + using Teuchos::Array; + //using Teuchos::ArrayView; + using std::endl; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + + using impl_scalar_type = typename Kokkos::ArithTraits::val_type; + + using crs_matrix_type = CrsMatrix; + + typedef typename crs_matrix_type::local_inds_device_view_type::non_const_value_type local_inds_device_value_t; + typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; + + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + const char tfecfFuncName[] = "copyAndPermuteStaticGraphNew"; + ProfilingRegion regionCAP + ("Tpetra::CrsMatrix::copyAndPermuteStaticGraphNew"); + + // const bool debug = Details::Behavior::debug("CrsGraph"); + // const bool verbose = Details::Behavior::verbose("CrsGraph"); + + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + TEUCHOS_TEST_FOR_EXCEPTION(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); + TEUCHOS_TEST_FOR_EXCEPTION(tgtMatCrsPtr == nullptr, std::runtime_error, "bad tgtMatCrsPtr"); + crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; + + std::string prefix = tfecfFuncName; + // const char* const prefix_raw = prefix.c_str(); + + const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); + // + // Copy the first numSame row from source to target (this matrix). + // This involves copying rows corresponding to LIDs [0, numSame-1]. + // + const auto& srcRowMap = * (srcMat.getRowMap ()); + auto comm = srcRowMap.getComm(); + + const LO numSameIDs_as_LID = static_cast (numSameIDs); + + auto my_replaceGlobalValuesImpl_scalar + = KOKKOS_LAMBDA( + const bool sorted, const bool atomic, size_t hint[], + const size_t numInTgtRow, const local_inds_device_value_t tgtColInds[], impl_scalar_type tgtRowVals[], + const local_inds_device_value_t lclColInd, const impl_scalar_type newVals + ) -> LO + { + LO numValid = 0; // number of valid input column indices + + if (atomic) { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + Kokkos::atomic_store (&tgtRowVals[offset], newVals); + hint[0] = offset + 1; + numValid++; + } + } + } else { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + tgtRowVals[offset] = newVals; + hint[0] = offset + 1; + numValid++; + } + } + } + return numValid; + }; + + if (sourceIsLocallyIndexed) { + + const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); + + typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); + typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); + typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); + + bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); + + using local_map_type = typename crs_matrix_type::map_type::local_map_type; + + local_map_type local_map = srcMat.getRowMap()->getLocalMap(); + local_map_type local_col_map = srcMat.getColMap()->getLocalMap(); + local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); + local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); + + auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); + auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); + + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + local_inds_device_value_t start = srcLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); + local_inds_device_value_t rowLength = (end - start); + + local_inds_device_value_t tstart = tgtLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); + local_inds_device_value_t numInTgtRow = (tend - tstart); + + KOKKOS_ASSERT(tstart < tvals.extent(0)); + impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); + const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); + + size_t hint=0; + for (LO j = 0; j < rowLength; j++) { + local_inds_device_value_t ci = srcLocalColIndsDevice(start + j); + GO gi = local_col_map.getGlobalElement(ci); + const local_inds_device_value_t lclColInd = tgt_local_col_map.getLocalElement(gi); + my_replaceGlobalValuesImpl_scalar(tgtMatIsSorted, false, &hint, + numInTgtRow, tgtColInds, tgtRowVals, + lclColInd, vals(start+j)); + } + + }); // kokkos parallel_for + + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + // FIXME - need to apply the same approach as above, maybe reuse my_replaceGlobalValuesImpl_scalar? + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + } + + // FIXME - need to apply the same approach as above to the permutes + + // + // "Permute" part of "copy and permute." + // + typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; + typename crs_matrix_type::nonconst_values_host_view_type rowVals; + + const auto& tgtRowMap = * (tgtMat.getRowMap ()); + for (size_t p = 0; p < numPermutes; ++p) { + const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); + const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + if (sourceIsLocallyIndexed) { + const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); + if (rowLength > static_cast (rowInds.size ())) { + Kokkos::resize(rowInds,rowLength); + Kokkos::resize(rowVals,rowLength); + } + // Resizing invalidates an Array's views, so we must make new + // ones, even if rowLength hasn't changed. + typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + + // The source matrix is locally indexed, so we have to get a + // copy. Really it's the GIDs that have to be copied (because + // they have to be converted from LIDs). + size_t checkRowLength = 0; + srcMat.getGlobalRowCopy(sourceGID, rowIndsView, + rowValsView, checkRowLength); + + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + else { + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + + } + } // namespace Tpetra // diff --git a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp index cc17543ec7c1..24072bc0996b 100644 --- a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp @@ -624,7 +624,15 @@ class WrappedDualView { // We check to see if the memory is not aliased *or* if it is a supported // (heterogeneous memory) accelerator (for shared host/device memory). - return !memoryIsAliased() || Spaces::is_gpu_exec_space(); + if constexpr(Spaces::is_gpu_exec_space()) { + return true; + } else { + if constexpr(!deviceMemoryIsHostAccessible) { + return true; + } else { + return dualView.h_view.data() != dualView.d_view.data(); + } + } } diff --git a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp index 4496f17eb12a..05d484574224 100644 --- a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp @@ -20,6 +20,7 @@ #include #include #include +#include /// \file Tpetra_Details_crsUtils.hpp /// \brief Functions for manipulating CRS arrays @@ -422,8 +423,8 @@ insert_crs_indices( if (idx == cur_indices[row_offset]) { break; } - } - + } + if (row_offset == end) { if (num_inserted >= num_avail) { // not enough room return Teuchos::OrdinalTraits::invalid(); @@ -499,22 +500,67 @@ find_crs_indices( size_t num_found = 0; for (size_t k = 0; k < new_indices.size(); k++) { - auto row_offset = start; auto idx = std::forward(map)(new_indices[k]); if (idx == invalid_ordinal) continue; - for (; row_offset < end; row_offset++) + for (size_t row_offset = start; row_offset < end; row_offset++) { - if (idx == cur_indices[row_offset]) + size_t off = row_offset - start; + auto lidx = cur_indices[row_offset]; + if (idx == lidx) { - std::forward(cb)(k, start, row_offset - start); + std::forward(cb)(k, start, off); num_found++; + // FIXME why no break here, can an index be found twice? + // break; } } } return num_found; } +/// \brief Implementation of findCrsIndices +template +size_t +find_crs_indices_sorted( + typename Pointers::value_type const row, + Pointers const& row_ptrs, + const size_t curNumEntries, + Indices1 const& cur_indices, + Indices2 const& new_indices, + IndexMap&& map, + Callback&& cb) +{ + if (new_indices.size() == 0) + return 0; + + using ordinal = + typename std::remove_const::type; + auto invalid_ordinal = Teuchos::OrdinalTraits::invalid(); + + const size_t start = static_cast (row_ptrs[row]); + const size_t end = start + curNumEntries; + size_t num_found = 0; + for (size_t k = 0; k < new_indices.size(); k++) + { + auto idx = std::forward(map)(new_indices[k]); + if (idx == invalid_ordinal) + continue; + + // FIXME use kokkos findRelOffset + auto first = &cur_indices[start]; + auto first0 = first; + auto last = &cur_indices[end]; + first = std::lower_bound(first, last, idx); + size_t off = first - first0; + if (first != last && !(idx < *first)) { + std::forward(cb)(k, start, off); + num_found++; + } + } + return num_found; +} + } // namespace impl @@ -718,6 +764,20 @@ findCrsIndices( return impl::find_crs_indices(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); } +template +size_t +findCrsIndicesSorted( + typename Pointers::value_type const row, + Pointers const& rowPtrs, + const size_t curNumEntries, + Indices1 const& curIndices, + Indices2 const& newIndices, + IndexMap&& map, + Callback&& cb) +{ + return impl::find_crs_indices_sorted(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); +} + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Map_decl.hpp b/packages/tpetra/core/src/Tpetra_Map_decl.hpp index 2ea5ee6f343e..4b406405e9f9 100644 --- a/packages/tpetra/core/src/Tpetra_Map_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_decl.hpp @@ -642,6 +642,7 @@ namespace Tpetra { /// the same value as /// Teuchos::OrdinalTraits::invalid(). global_ordinal_type getGlobalElement (local_ordinal_type localIndex) const; + bool getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const; /// \brief Get the LocalMap for Kokkos-Kernels. /// diff --git a/packages/tpetra/core/src/Tpetra_Map_def.hpp b/packages/tpetra/core/src/Tpetra_Map_def.hpp index c6b028e8f616..f697ab3aa8bb 100644 --- a/packages/tpetra/core/src/Tpetra_Map_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_def.hpp @@ -1066,7 +1066,7 @@ namespace Tpetra { // beginning of the range starts with the first entry. While // doing so, fill in the LID -> GID table. typename decltype (lgMap_)::non_const_type lgMap - (view_alloc ("lgMap", WithoutInitializing), numLocalElements_); + (view_alloc ("lgMap2", WithoutInitializing), numLocalElements_); // Because you can't use lambdas in constructors on CUDA. Or using private/protected data. // DEEP_COPY REVIEW - DEVICE-TO-DEVICE @@ -1274,6 +1274,40 @@ namespace Tpetra { } } + template + bool + Map:: + getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const + { + auto const minGI = getMinGlobalIndex(); + auto const minLI = getMinLocalIndex(); + auto const maxLI = getMaxLocalIndex(); + if (isContiguous ()) { + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = minGI + lclInd; + } + } + else { + // This is a host Kokkos::View access, with no RCP or ArrayRCP + // involvement. As a result, it is thread safe. + // + // lgMapHost_ is a host pointer; this does NOT assume UVM. + lazyPushToHost(); + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = lgMapHost_[lclInd]; + } + } + return false; + } + template bool Map:: @@ -1662,7 +1696,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap3", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl; @@ -1749,7 +1783,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap4", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl; From 4bf45065c01c124f103b35dd4cd890850a94adf2 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 21 Jan 2025 15:21:53 -0700 Subject: [PATCH 2/7] Tpetra CrsMatrix: Fix warning Signed-off-by: Christian Glusa --- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 27883ad5f545..32e649fd64a6 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -9539,7 +9539,7 @@ CrsMatrix:: local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); local_inds_device_value_t numInTgtRow = (tend - tstart); - KOKKOS_ASSERT(tstart < tvals.extent(0)); + KOKKOS_ASSERT(static_cast(tstart) < tvals.extent(0)); impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); From 0cbaea9b3b4c3e7288e94dfd8dd33a5ed401706a Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Tue, 28 Jan 2025 10:05:11 -0700 Subject: [PATCH 3/7] remove temporary code Signed-off-by: Tim Fuller --- .../tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 2 +- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 321 ++----- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 820 ++++++------------ 3 files changed, 357 insertions(+), 786 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 01c1f3ac6d2b..8949499e695f 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1204,7 +1204,7 @@ namespace Tpetra { LocalOrdinal loopEnd); void - copyAndPermuteNew + copyAndPermuteImpl (const row_graph_type& source, row_graph_type& target, const size_t numSameIDs, diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 6fcf2700c066..ca3bb53ada34 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -4825,125 +4825,9 @@ namespace Tpetra { using LO = local_ordinal_type; using GO = global_ordinal_type; using this_CRS_type = CrsGraph; - const char tfecfFuncName[] = "copyAndPermute: "; - const bool verbose = verbose_; - - if (true) { - const row_graph_type& srcRowGraph = - dynamic_cast (source); - copyAndPermuteNew(srcRowGraph, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, INSERT); - return; - } - - Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("CrsGraph", "copyAndPermute"); - std::ostringstream os; - os << *prefix << endl; - std::cerr << os.str (); - } - - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), - std::runtime_error, "permuteToLIDs.extent(0) = " - << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " - << permuteFromLIDs.extent (0) << "."); - - // We know from checkSizes that the source object is a - // row_graph_type, so we don't need to check again. - const row_graph_type& srcRowGraph = - dynamic_cast (source); - - if (verbose) { - std::ostringstream os; - os << *prefix << "Compute padding" << endl; - std::cerr << os.str (); - } - auto padding = computeCrsPadding(srcRowGraph, numSameIDs, - permuteToLIDs, permuteFromLIDs, verbose); - - applyCrsPadding(*padding, verbose); - - // If the source object is actually a CrsGraph, we can use view - // mode instead of copy mode to access the entries in each row, - // if the graph is not fill complete. - const this_CRS_type* srcCrsGraph = - dynamic_cast (&source); - - const map_type& srcRowMap = *(srcRowGraph.getRowMap()); - const map_type& tgtRowMap = *(getRowMap()); - const bool src_filled = srcRowGraph.isFillComplete(); - nonconst_global_inds_host_view_type row_copy; - LO myid = 0; - - // - // "Copy" part of "copy and permute." - // - if (src_filled || srcCrsGraph == nullptr) { - if (verbose) { - std::ostringstream os; - os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; - std::cerr << os.str (); - } - // If the source graph is fill complete, we can't use view mode, - // because the data might be stored in a different format not - // compatible with the expectations of view mode. Also, if the - // source graph is not a CrsGraph, we can't use view mode, - // because RowGraph only provides copy mode access to the data. - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); - this->insertGlobalIndices (gid, row_length, row_copy.data()); - } - } else { - if (verbose) { - std::ostringstream os; - os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; - std::cerr << os.str (); - } - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - global_inds_host_view_type row; - srcCrsGraph->getGlobalRowView (gid, row); - this->insertGlobalIndices (gid, row.extent(0), row.data()); - } - } - - // - // "Permute" part of "copy and permute." - // - auto permuteToLIDs_h = permuteToLIDs.view_host (); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); - - if (src_filled || srcCrsGraph == nullptr) { - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); - this->insertGlobalIndices (mygid, row_length, row_copy.data()); - } - } else { - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - global_inds_host_view_type row; - srcCrsGraph->getGlobalRowView (srcgid, row); - this->insertGlobalIndices (mygid, row.extent(0), row.data()); - } - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Done" << endl; - std::cerr << os.str (); - } + const row_graph_type& srcRowGraph = dynamic_cast (source); + copyAndPermuteImpl(srcRowGraph, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, INSERT); + return; } template @@ -7661,66 +7545,69 @@ namespace Tpetra { Kokkos::abort("error"); \ } while(0) - Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", - range_type (0, loopEnd), - KOKKOS_LAMBDA(const LO sourceLID) - { - auto srcLid = sourceLID; - auto tgtLid = sourceLID; - if (hasMap) { - srcLid = permuteFromLIDs_d(srcLid); - tgtLid = permuteToLIDs_d(tgtLid); - } - auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); - if (srcGid == GINV) CRSGRAPH_INNER_ABORT(__LINE__); - auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); - - auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); - if (tgtLocalRow == LINV) CRSGRAPH_INNER_ABORT(__LINE__); - if (tgtLocalRow != tgtLid) CRSGRAPH_INNER_ABORT(__LINE__); - auto tgtNumEntries = k_numRowEnt(tgtLocalRow); - - // FIXME no auto use - auto start = srcLocalRowPtrsDevice(srcLid); - auto end = srcLocalRowPtrsDevice(srcLid + 1); - auto rowLength = (end - start); - - auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); - auto tend = tstart + tgtNumEntries; - auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); - - const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; - size_t num_inserted = 0; - - global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); - - size_t hint=0; - for (size_t j = 0; j < rowLength; j++) { - auto ci = srcLocalColIndsDevice(start + j); - GO gi = srcColMapLocal.getGlobalElement(ci); - if (gi == GINV) CRSGRAPH_INNER_ABORT(__LINE__); - auto numInTgtRow = (tend - tstart); - - const size_t offset = - KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, - numInTgtRow, - gi, hint, sorted); - - if (offset == numInTgtRow) { - if (num_inserted >= num_avail) { // not enough room - Kokkos::abort("num_avail"); - } - tgtGlobalColIndsPtr[tstart + offset] = gi; - ++tend; - hint = offset + 1; - ++num_inserted; - } - } - k_numRowEnt(tgtLocalRow) += num_inserted; - - return size_t(0); - }); + Kokkos::parallel_for( + "Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, loopEnd), + KOKKOS_LAMBDA(const LO sourceLID) { + auto srcLid = sourceLID; + auto tgtLid = sourceLID; + if (hasMap) { + srcLid = permuteFromLIDs_d(srcLid); + tgtLid = permuteToLIDs_d(tgtLid); + } + auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); + if (srcGid == GINV) + CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); + + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); + if (tgtLocalRow == LINV) + CRSGRAPH_INNER_ABORT(__LINE__); + if (tgtLocalRow != tgtLid) + CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + auto start = srcLocalRowPtrsDevice(srcLid); + auto end = srcLocalRowPtrsDevice(srcLid + 1); + auto rowLength = (end - start); + + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); + auto tend = tstart + tgtNumEntries; + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + size_t num_inserted = 0; + + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); + + size_t hint=0; + for (size_t j = 0; j < rowLength; j++) { + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) + CRSGRAPH_INNER_ABORT(__LINE__); + auto numInTgtRow = (tend - tstart); + + const size_t offset = KokkosSparse::findRelOffset( + tgtGlobalColIndsPtr+tstart, numInTgtRow, gi, hint, sorted + ); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + Kokkos::abort("num_avail"); + } + tgtGlobalColIndsPtr[tstart + offset] = gi; + ++tend; + hint = offset + 1; + ++num_inserted; + } + } + k_numRowEnt(tgtLocalRow) += num_inserted; + return size_t(0); + } + ); Kokkos::fence("here 10"); Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); tgtCrsGraph.setLocallyModified(); @@ -7730,25 +7617,26 @@ namespace Tpetra { template void CrsGraph:: - copyAndPermuteNew(const row_graph_type& srcRowGraph, - row_graph_type& tgtRowGraph, - const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs, - const Kokkos::DualView& permuteFromLIDs, - const CombineMode CM) - { + copyAndPermuteImpl( + const row_graph_type& srcRowGraph, + row_graph_type& tgtRowGraph, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM + ) { using std::endl; using LO = local_ordinal_type; using GO = global_ordinal_type; - const char tfecfFuncName[] = "copyAndPermuteNew: "; + const char tfecfFuncName[] = "copyAndPermuteImpl: "; const bool verbose = verbose_; - Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteNew"); + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteImpl"); std::unique_ptr prefix; if (verbose) { - prefix = this->createPrefix("CrsGraph", "copyAndPermuteNew"); + prefix = this->createPrefix("CrsGraph", "copyAndPermuteImpl"); std::ostringstream os; os << *prefix << endl; std::cerr << os.str (); @@ -7768,18 +7656,21 @@ namespace Tpetra { using crs_graph_type = CrsGraph; const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, - "error srcGraph type= " << typeid(srcRowGraph).name()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + !srcCrsGraphPtr, std::runtime_error, "error srcGraph type= " << typeid(srcRowGraph).name() + ); const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, - "error tgtGraph type= " << typeid(tgtRowGraph).name()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + !srcCrsGraphPtr, std::runtime_error, "error tgtGraph type= " << typeid(tgtRowGraph).name() + ); crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; - auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, - permuteToLIDs, permuteFromLIDs, verbose); + auto padding = tgtCrsGraph.computeCrsPadding( + srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose + ); tgtCrsGraph.applyCrsPadding(*padding, verbose); @@ -7805,21 +7696,8 @@ namespace Tpetra { // compatible with the expectations of view mode. Also, if the // source graph is not a CrsGraph, we can't use view mode, // because RowGraph only provides copy mode access to the data. -#if 0 - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); - tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); - } -#else Kokkos::DualView noPermute; - insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, - noPermute, noPermute, - numSameIDs_as_LID); -#endif + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, noPermute, noPermute, numSameIDs_as_LID); } else { if (verbose) { std::ostringstream os; @@ -7843,21 +7721,14 @@ namespace Tpetra { auto permuteFromLIDs_d = permuteFromLIDs.view_device (); if (src_filled || srcCrsGraphPtr == nullptr) { -#if 0 - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); - tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); - } -#else - insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, - permuteToLIDs, permuteFromLIDs, // note reversed arg order, tgt, then src - static_cast (permuteToLIDs_h.extent (0))); -#endif + // note reversed arg order, tgt, then src + insertGlobalIndicesDevice( + srcCrsGraph, + tgtCrsGraph, + permuteToLIDs, + permuteFromLIDs, + static_cast (permuteToLIDs_h.extent (0)) + ); } else { for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 32e649fd64a6..0b9fb9cb05d6 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5746,14 +5746,14 @@ CrsMatrix:: } template - void - copyAndPermuteStaticGraphNew( - const RowMatrix& srcMat, - RowMatrix& tgtMat, - const size_t numSameIDs, - const LocalOrdinal permuteToLIDs[], - const LocalOrdinal permuteFromLIDs[], - const size_t numPermutes); + void copyAndPermuteStaticGraphImpl( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes + ); template void @@ -5764,6 +5764,22 @@ CrsMatrix:: const LocalOrdinal permuteToLIDs[], const LocalOrdinal permuteFromLIDs[], const size_t numPermutes) + { + copyAndPermuteStaticGraphImpl( + srcMat, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, numPermutes + ); + return; + } + + template + void + CrsMatrix:: + copyAndPermuteNonStaticGraph( + const RowMatrix& srcMat, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs_dv, + const Kokkos::DualView& permuteFromLIDs_dv, + const size_t numPermutes) { using Details::ProfilingRegion; using Teuchos::Array; @@ -5771,11 +5787,11 @@ CrsMatrix:: using std::endl; using LO = LocalOrdinal; using GO = GlobalOrdinal; - const char tfecfFuncName[] = "copyAndPermuteStaticGraph"; + const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph"; const char suffix[] = " Please report this bug to the Tpetra developers."; ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph"); + ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph"); const bool debug = Details::Behavior::debug("CrsGraph"); const bool verbose = Details::Behavior::verbose("CrsGraph"); @@ -5788,165 +5804,69 @@ CrsMatrix:: const char* const prefix_raw = verbose ? prefix.get()->c_str() : nullptr; + { + using row_graph_type = RowGraph; + const row_graph_type& srcGraph = *(srcMat.getGraph()); + auto padding = + myGraph_->computeCrsPadding(srcGraph, numSameIDs, + permuteToLIDs_dv, permuteFromLIDs_dv, verbose); + applyCrsPadding(*padding, verbose); + } const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); - const bool targetIsLocallyIndexed = this->isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // - const auto& srcRowMap = * (srcMat.getRowMap ()); - nonconst_global_inds_host_view_type rowInds; - nonconst_values_host_view_type rowVals; + const map_type& srcRowMap = * (srcMat.getRowMap ()); const LO numSameIDs_as_LID = static_cast (numSameIDs); + using gids_type = nonconst_global_inds_host_view_type; + using vals_type = nonconst_values_host_view_type; + gids_type rowInds; + vals_type rowVals; + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; - // FIXME - need to examine this path - if (0 && targetIsLocallyIndexed && sourceIsLocallyIndexed) { - - // Create a mapping from the source's local column id's to my local column ids - using DT = typename Node::device_type; - const map_type& src_col_map = *(srcMat.getColMap()); - const map_type& tgt_col_map = *(this->getColMap()); - - auto local_src_col_map = src_col_map.getLocalMap(); - auto local_tgt_col_map = tgt_col_map.getLocalMap(); - - auto invalid = Teuchos::OrdinalTraits::invalid(); - LO num_src_cols = static_cast(src_col_map.getLocalNumElements()); - Kokkos::UnorderedMap lid_map(num_src_cols); - for (LO src_local_col_idx=0; src_local_col_idxgetLocalRowView(local_row, tgt_local_cols, tgt_local_vals); - - Kokkos::View indices("tgt_local_cols", src_local_cols.extent(0)); - Kokkos::View values("tgt_local_vals", src_local_cols.extent(0)); - size_t idx = 0; - for (size_t offset=0; offset indices_const(indices.data(), indices.size()); - const impl_scalar_type* const values_const_data = reinterpret_cast(values.data()); - Kokkos::View values_const(values_const_data, values.size()); - auto inds = Kokkos::subview(indices_const, Kokkos::make_pair(size_t(0), idx)); - auto vals = Kokkos::subview(values_const, Kokkos::make_pair(size_t(0), idx)); - this->replaceLocalValues(local_row, inds, vals); - } - } else if (sourceIsLocallyIndexed) { - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; + ArrayView rowIndsConstView; + ArrayView rowValsConstView; - ArrayView rowIndsConstView; - ArrayView rowValsConstView; + if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - - if (rowLength > static_cast (rowInds.size())) { + if (rowLength > static_cast (rowInds.extent(0))) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - - nonconst_global_inds_host_view_type rowIndsView; - nonconst_values_host_view_type rowValsView; + gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because // they have to be converted from LIDs). size_t checkRowLength = 0; - - { - using crs_matrix_type = CrsMatrix; - const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); - const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; - - auto globalRow = sourceGID; - auto StaticGraphRCP = srcMatCrs.getGraph(); - const crs_graph_type *StaticGraphPtr = dynamic_cast(StaticGraphRCP.get()); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(StaticGraphPtr == nullptr, std::runtime_error, "bad StaticGraphPtr"); - const crs_graph_type& StaticGraph = *StaticGraphPtr; - const RowInfo rowinfo = StaticGraph.getRowInfoFromGlobalRowIndex (globalRow); - const size_t theNumEntries = rowinfo.numEntries; - checkRowLength = theNumEntries; // first side effect - auto numEntries = theNumEntries; - - if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { - if (StaticGraph.isLocallyIndexed ()) { - const map_type& colMap = * (StaticGraph.getColMap()); - auto curLclInds = StaticGraph.getLocalIndsViewHost(rowinfo); - auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); - rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - auto rowIndsViewLocal = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); - bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, rowIndsViewLocal.data()); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); - } - else if (StaticGraph.isGloballyIndexed ()) { - auto rowIndsViewLocal = StaticGraph.getGlobalIndsViewHost(rowinfo); - rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); - auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); - rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - } - } - } - + srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, + checkRowLength); if (debug) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, "For " + (rowLength != checkRowLength, std::logic_error, ": For " "global row index " << sourceGID << ", the source " "matrix's getNumEntriesInGlobalRow returns a row length " "of " << rowLength << ", but getGlobalRowCopy reports " "a row length of " << checkRowLength << "." << suffix); } - - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); - } // for (sourceLID... - } else { - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - - ArrayViewrowIndsConstView; - ArrayView rowValsConstView; - + rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); + rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); + } + else { // source matrix is globally indexed. global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews @@ -5959,23 +5879,19 @@ CrsMatrix:: Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface - - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); } + + // Combine the data into the target matrix. + insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, + rowValsConstView, prefix_raw, debug, verbose); } if (verbose) { std::ostringstream os; os << *prefix << "Do permutes" << endl; } - - // - // "Permute" part of "copy and permute." - // + const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data(); + const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data(); const map_type& tgtRowMap = * (this->getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { @@ -5987,14 +5903,14 @@ CrsMatrix:: if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.size ())) { + if (rowLength > static_cast (rowInds.extent(0))) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because @@ -6010,24 +5926,14 @@ CrsMatrix:: rowLength << ", but getGlobalRowCopy a row length of " << checkRowLength << "." << suffix); } - - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface + rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); + rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); } else { global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews @@ -6042,9 +5948,9 @@ CrsMatrix:: // KDDKDD UVM TEMPORARY: KokkosView interface } - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); + // Combine the data into the target matrix. + insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, + rowValsConstView, prefix_raw, debug, verbose); } if (verbose) { @@ -6054,145 +5960,209 @@ CrsMatrix:: } template - void - CrsMatrix:: - copyAndPermuteNonStaticGraph( + void copyAndPermuteStaticGraphImpl( const RowMatrix& srcMat, + RowMatrix& tgtMat, const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs_dv, - const Kokkos::DualView& permuteFromLIDs_dv, - const size_t numPermutes) - { + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes + ) { using Details::ProfilingRegion; using Teuchos::Array; - using Teuchos::ArrayView; + //using Teuchos::ArrayView; using std::endl; using LO = LocalOrdinal; using GO = GlobalOrdinal; - const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph"; - const char suffix[] = - " Please report this bug to the Tpetra developers."; + + using impl_scalar_type = typename Kokkos::ArithTraits::val_type; + + using crs_matrix_type = CrsMatrix; + + typedef typename crs_matrix_type::local_inds_device_view_type::non_const_value_type local_inds_device_value_t; + typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; + + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + const char tfecfFuncName[] = "copyAndPermuteStaticGraphImpl"; ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph"); + ("Tpetra::CrsMatrix::copyAndPermuteStaticGraphImpl"); - const bool debug = Details::Behavior::debug("CrsGraph"); - const bool verbose = Details::Behavior::verbose("CrsGraph"); - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("CrsGraph", tfecfFuncName); - std::ostringstream os; - os << *prefix << "Start" << endl; - } - const char* const prefix_raw = - verbose ? prefix.get()->c_str() : nullptr; + // const bool debug = Details::Behavior::debug("CrsGraph"); + // const bool verbose = Details::Behavior::verbose("CrsGraph"); + + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + TEUCHOS_TEST_FOR_EXCEPTION(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); + TEUCHOS_TEST_FOR_EXCEPTION(tgtMatCrsPtr == nullptr, std::runtime_error, "bad tgtMatCrsPtr"); + crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; + + std::string prefix = tfecfFuncName; + // const char* const prefix_raw = prefix.c_str(); - { - using row_graph_type = RowGraph; - const row_graph_type& srcGraph = *(srcMat.getGraph()); - auto padding = - myGraph_->computeCrsPadding(srcGraph, numSameIDs, - permuteToLIDs_dv, permuteFromLIDs_dv, verbose); - applyCrsPadding(*padding, verbose); - } const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // - const map_type& srcRowMap = * (srcMat.getRowMap ()); + const auto& srcRowMap = * (srcMat.getRowMap ()); + auto comm = srcRowMap.getComm(); + const LO numSameIDs_as_LID = static_cast (numSameIDs); - using gids_type = nonconst_global_inds_host_view_type; - using vals_type = nonconst_values_host_view_type; - gids_type rowInds; - vals_type rowVals; - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - ArrayView rowIndsConstView; - ArrayView rowValsConstView; + auto my_replaceGlobalValuesImpl_scalar + = KOKKOS_LAMBDA( + const bool sorted, const bool atomic, size_t hint[], + const size_t numInTgtRow, const local_inds_device_value_t tgtColInds[], impl_scalar_type tgtRowVals[], + const local_inds_device_value_t lclColInd, const impl_scalar_type newVals + ) -> LO + { + LO numValid = 0; // number of valid input column indices - if (sourceIsLocallyIndexed) { + if (atomic) { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + Kokkos::atomic_store (&tgtRowVals[offset], newVals); + hint[0] = offset + 1; + numValid++; + } + } + } else { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + tgtRowVals[offset] = newVals; + hint[0] = offset + 1; + numValid++; + } + } + } + return numValid; + }; - const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.extent(0))) { - Kokkos::resize(rowInds,rowLength); - Kokkos::resize(rowVals,rowLength); - } - // Resizing invalidates an Array's views, so we must make new - // ones, even if rowLength hasn't changed. - gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + if (sourceIsLocallyIndexed) { - // The source matrix is locally indexed, so we have to get a - // copy. Really it's the GIDs that have to be copied (because - // they have to be converted from LIDs). - size_t checkRowLength = 0; - srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, - checkRowLength); - if (debug) { - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, ": For " - "global row index " << sourceGID << ", the source " - "matrix's getNumEntriesInGlobalRow returns a row length " - "of " << rowLength << ", but getGlobalRowCopy reports " - "a row length of " << checkRowLength << "." << suffix); - } - rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); - rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); - } - else { // source matrix is globally indexed. - global_inds_host_view_type rowIndsView; - values_host_view_type rowValsView; - srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); + + typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); + typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); + typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); + + bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); + + using local_map_type = typename crs_matrix_type::map_type::local_map_type; + + local_map_type local_map = srcMat.getRowMap()->getLocalMap(); + local_map_type local_col_map = srcMat.getColMap()->getLocalMap(); + local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); + local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); + + auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); + auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); + + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraphImpl", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + local_inds_device_value_t start = srcLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); + local_inds_device_value_t rowLength = (end - start); + + local_inds_device_value_t tstart = tgtLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); + local_inds_device_value_t numInTgtRow = (tend - tstart); + + KOKKOS_ASSERT(tstart < tvals.extent(0)); + impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); + const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); + + size_t hint=0; + for (LO j = 0; j < rowLength; j++) { + local_inds_device_value_t ci = srcLocalColIndsDevice(start + j); + GO gi = local_col_map.getGlobalElement(ci); + const local_inds_device_value_t lclColInd = tgt_local_col_map.getLocalElement(gi); + my_replaceGlobalValuesImpl_scalar(tgtMatIsSorted, false, &hint, + numInTgtRow, tgtColInds, tgtRowVals, + lclColInd, vals(start+j)); + } + + }); // kokkos parallel_for + + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface - } - // Combine the data into the target matrix. - insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, - rowValsConstView, prefix_raw, debug, verbose); + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + // FIXME - need to apply the same approach as above, maybe reuse my_replaceGlobalValuesImpl_scalar? + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } } - if (verbose) { - std::ostringstream os; - os << *prefix << "Do permutes" << endl; - } - const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data(); - const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data(); + // FIXME - need to apply the same approach as above to the permutes - const map_type& tgtRowMap = * (this->getRowMap ()); + // + // "Permute" part of "copy and permute." + // + typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; + typename crs_matrix_type::nonconst_values_host_view_type rowVals; + + const auto& tgtRowMap = * (tgtMat.getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); - ArrayView rowIndsConstView; - ArrayView rowValsConstView; + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.extent(0))) { + if (rowLength > static_cast (rowInds.size ())) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because @@ -6200,45 +6170,42 @@ CrsMatrix:: size_t checkRowLength = 0; srcMat.getGlobalRowCopy(sourceGID, rowIndsView, rowValsView, checkRowLength); - if (debug) { - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, "For " - "source matrix global row index " << sourceGID << ", " - "getNumEntriesInGlobalRow returns a row length of " << - rowLength << ", but getGlobalRowCopy a row length of " - << checkRowLength << "." << suffix); - } - rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); - rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); + + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface } else { - global_inds_host_view_type rowIndsView; - values_host_view_type rowValsView; + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface } - // Combine the data into the target matrix. - insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, - rowValsConstView, prefix_raw, debug, verbose); + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); } - if (verbose) { - std::ostringstream os; - os << *prefix << "Done" << endl; - } } template @@ -6293,31 +6260,13 @@ CrsMatrix:: using RMT = RowMatrix; const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { - if (1) - { - TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); - auto permuteToLIDs_d = permuteToLIDs.view_device (); - TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); - auto permuteFromLIDs_d = permuteFromLIDs.view_device (); - - copyAndPermuteStaticGraphNew(srcMat, *this, - numSameIDs, - permuteToLIDs_d.data(), - permuteFromLIDs_d.data(), - numPermute); - } - else { - TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); - auto permuteToLIDs_h = permuteToLIDs.view_host (); - TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); - - copyAndPermuteStaticGraph(srcMat, numSameIDs, - permuteToLIDs_h.data(), - permuteFromLIDs_h.data(), - numPermute); - - } + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + copyAndPermuteStaticGraph( + srcMat, numSameIDs, permuteToLIDs_d.data(), permuteFromLIDs_d.data(), numPermute + ); } else { copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs, @@ -9412,255 +9361,6 @@ CrsMatrix:: transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params); } - template - void - copyAndPermuteStaticGraphNew(const RowMatrix& srcMat, - RowMatrix& tgtMat, - const size_t numSameIDs, - const LocalOrdinal permuteToLIDs[], - const LocalOrdinal permuteFromLIDs[], - const size_t numPermutes) - { - using Details::ProfilingRegion; - using Teuchos::Array; - //using Teuchos::ArrayView; - using std::endl; - using LO = LocalOrdinal; - using GO = GlobalOrdinal; - - using impl_scalar_type = typename Kokkos::ArithTraits::val_type; - - using crs_matrix_type = CrsMatrix; - - typedef typename crs_matrix_type::local_inds_device_view_type::non_const_value_type local_inds_device_value_t; - typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; - - typedef typename Node::execution_space exec_space; - typedef Kokkos::RangePolicy range_type; - - const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); - - const char tfecfFuncName[] = "copyAndPermuteStaticGraphNew"; - ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteStaticGraphNew"); - - // const bool debug = Details::Behavior::debug("CrsGraph"); - // const bool verbose = Details::Behavior::verbose("CrsGraph"); - - const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); - TEUCHOS_TEST_FOR_EXCEPTION(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); - const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; - - crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); - TEUCHOS_TEST_FOR_EXCEPTION(tgtMatCrsPtr == nullptr, std::runtime_error, "bad tgtMatCrsPtr"); - crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; - - std::string prefix = tfecfFuncName; - // const char* const prefix_raw = prefix.c_str(); - - const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); - // - // Copy the first numSame row from source to target (this matrix). - // This involves copying rows corresponding to LIDs [0, numSame-1]. - // - const auto& srcRowMap = * (srcMat.getRowMap ()); - auto comm = srcRowMap.getComm(); - - const LO numSameIDs_as_LID = static_cast (numSameIDs); - - auto my_replaceGlobalValuesImpl_scalar - = KOKKOS_LAMBDA( - const bool sorted, const bool atomic, size_t hint[], - const size_t numInTgtRow, const local_inds_device_value_t tgtColInds[], impl_scalar_type tgtRowVals[], - const local_inds_device_value_t lclColInd, const impl_scalar_type newVals - ) -> LO - { - LO numValid = 0; // number of valid input column indices - - if (atomic) { - if (lclColInd != LINV) { - const size_t offset = - KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, - lclColInd, hint[0], sorted); - if (offset != numInTgtRow) { - Kokkos::atomic_store (&tgtRowVals[offset], newVals); - hint[0] = offset + 1; - numValid++; - } - } - } else { - if (lclColInd != LINV) { - const size_t offset = - KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, - lclColInd, hint[0], sorted); - if (offset != numInTgtRow) { - tgtRowVals[offset] = newVals; - hint[0] = offset + 1; - numValid++; - } - } - } - return numValid; - }; - - if (sourceIsLocallyIndexed) { - - const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); - const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); - - typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); - typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); - typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); - typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); - typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); - - bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); - - using local_map_type = typename crs_matrix_type::map_type::local_map_type; - - local_map_type local_map = srcMat.getRowMap()->getLocalMap(); - local_map_type local_col_map = srcMat.getColMap()->getLocalMap(); - local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); - local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); - - auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); - auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); - - Kokkos::parallel_for - ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", - range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID) - { - local_inds_device_value_t start = srcLocalRowPtrsDevice(sourceLID); - local_inds_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); - local_inds_device_value_t rowLength = (end - start); - - local_inds_device_value_t tstart = tgtLocalRowPtrsDevice(sourceLID); - local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); - local_inds_device_value_t numInTgtRow = (tend - tstart); - - KOKKOS_ASSERT(static_cast(tstart) < tvals.extent(0)); - impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); - const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); - - size_t hint=0; - for (LO j = 0; j < rowLength; j++) { - local_inds_device_value_t ci = srcLocalColIndsDevice(start + j); - GO gi = local_col_map.getGlobalElement(ci); - const local_inds_device_value_t lclColInd = tgt_local_col_map.getLocalElement(gi); - my_replaceGlobalValuesImpl_scalar(tgtMatIsSorted, false, &hint, - numInTgtRow, tgtColInds, tgtRowVals, - lclColInd, vals(start+j)); - } - - }); // kokkos parallel_for - - } else { - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - - Teuchos::ArrayView rowIndsConstView; - Teuchos::ArrayView rowValsConstView; - - typename crs_matrix_type::global_inds_host_view_type rowIndsView; - typename crs_matrix_type::values_host_view_type rowValsView; - srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - // FIXME - need to apply the same approach as above, maybe reuse my_replaceGlobalValuesImpl_scalar? - tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, - rowValsConstView); - } - } - - // FIXME - need to apply the same approach as above to the permutes - - // - // "Permute" part of "copy and permute." - // - typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; - typename crs_matrix_type::nonconst_values_host_view_type rowVals; - - const auto& tgtRowMap = * (tgtMat.getRowMap ()); - for (size_t p = 0; p < numPermutes; ++p) { - const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); - const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); - - Teuchos::ArrayView rowIndsConstView; - Teuchos::ArrayView rowValsConstView; - - if (sourceIsLocallyIndexed) { - const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.size ())) { - Kokkos::resize(rowInds,rowLength); - Kokkos::resize(rowVals,rowLength); - } - // Resizing invalidates an Array's views, so we must make new - // ones, even if rowLength hasn't changed. - typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); - - // The source matrix is locally indexed, so we have to get a - // copy. Really it's the GIDs that have to be copied (because - // they have to be converted from LIDs). - size_t checkRowLength = 0; - srcMat.getGlobalRowCopy(sourceGID, rowIndsView, - rowValsView, checkRowLength); - - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - } - else { - typename crs_matrix_type::global_inds_host_view_type rowIndsView; - typename crs_matrix_type::values_host_view_type rowValsView; - srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - } - - tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, - rowValsConstView); - } - - } - } // namespace Tpetra // From 38ba36a1005ef6809e7bda3b0542f2ed80976009 Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Wed, 29 Jan 2025 17:25:06 -0700 Subject: [PATCH 4/7] fix PR compile error Signed-off-by: Tim Fuller --- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 0b9fb9cb05d6..3b67a2dae905 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -6086,7 +6086,7 @@ CrsMatrix:: local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); local_inds_device_value_t numInTgtRow = (tend - tstart); - KOKKOS_ASSERT(tstart < tvals.extent(0)); + KOKKOS_ASSERT(tstart - tvals.extent(0) == 0); impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); From d13948590ef6bfc40d5086eeb0d9e6776985464d Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Thu, 30 Jan 2025 14:18:10 -0700 Subject: [PATCH 5/7] fix kokkos_assert checking a CRS start index Signed-off-by: Tim Fuller --- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 3b67a2dae905..99f7d23f683d 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -6086,7 +6086,7 @@ CrsMatrix:: local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); local_inds_device_value_t numInTgtRow = (tend - tstart); - KOKKOS_ASSERT(tstart - tvals.extent(0) == 0); + KOKKOS_ASSERT(static_case(tstart) < tvals.extent(0)); impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); From 32818722a227f7f82160b91fedb927b642875e19 Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Thu, 30 Jan 2025 21:51:19 -0700 Subject: [PATCH 6/7] fix type static_case -> static_case Signed-off-by: Tim Fuller --- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 99f7d23f683d..7f88a841060b 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -6086,7 +6086,7 @@ CrsMatrix:: local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); local_inds_device_value_t numInTgtRow = (tend - tstart); - KOKKOS_ASSERT(static_case(tstart) < tvals.extent(0)); + KOKKOS_ASSERT(static_cast(tstart) < tvals.extent(0)); impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); From 6b87bc446813aaa88a6f88be2b5a4800d10aa2e1 Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Fri, 14 Feb 2025 07:21:55 -0700 Subject: [PATCH 7/7] remove fence Signed-off-by: Tim Fuller --- packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index ca3bb53ada34..c33bda62acb1 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -7608,7 +7608,6 @@ namespace Tpetra { return size_t(0); } ); - Kokkos::fence("here 10"); Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); tgtCrsGraph.setLocallyModified(); }