diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 85f91bd676c2..8949499e695f 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1194,6 +1194,26 @@ namespace Tpetra { buffer_device_type>& permuteFromLIDs, const CombineMode CM) override; + + void + insertGlobalIndicesDevice + (const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd); + + void + copyAndPermuteImpl + (const row_graph_type& source, + row_graph_type& target, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM); + using padding_type = Details::CrsPadding< local_ordinal_type, global_ordinal_type>; diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 17fbcbfb9a5d..c33bda62acb1 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -964,6 +964,8 @@ namespace Tpetra { CrsGraph:: getLocalNumEntries () const { + Details::ProfilingRegion regionGLNE("Tpetra::CrsGraph::getLocalNumEntries"); + const char tfecfFuncName[] = "getLocalNumEntries: "; typedef LocalOrdinal LO; @@ -1185,7 +1187,6 @@ namespace Tpetra { CrsGraph:: allocateIndices (const ELocalGlobal lg, const bool verbose) { - using Details::ProfilingRegion; using Teuchos::arcp; using Teuchos::Array; using Teuchos::ArrayRCP; @@ -1196,7 +1197,7 @@ namespace Tpetra { const char tfecfFuncName[] = "allocateIndices: "; const char suffix[] = " Please report this bug to the Tpetra developers."; - ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); + Details::ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); std::unique_ptr prefix; if (verbose) { @@ -1593,6 +1594,8 @@ namespace Tpetra { typedef GlobalOrdinal GO; const char tfecfFuncName[] = "insertIndices: "; + Details::ProfilingRegion regionII("Tpetra::CrsGraph::insertIndices"); + size_t oldNumEnt = 0; if (debug_) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC @@ -1714,12 +1717,15 @@ namespace Tpetra { const char tfecfFuncName[] = "insertGlobalIndicesImpl: "; const LO lclRow = static_cast (rowInfo.localRow); + Details::ProfilingRegion regionIGII("Tpetra::CrsGraph::insertGlobalIndicesImpl"); + auto numEntries = rowInfo.numEntries; using inp_view_type = View; inp_view_type inputInds(inputGblColInds, numInputInds); size_t numInserted; { auto gblIndsHostView = this->gblInds_wdv.getHostView(Access::ReadWrite); + // FIXME - device numInserted = Details::insertCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), gblIndsHostView, numEntries, inputInds, fun); @@ -1776,6 +1782,8 @@ namespace Tpetra { using LO = LocalOrdinal; const char tfecfFuncName[] = "insertLocallIndicesImpl: "; + Details::ProfilingRegion regionILII("Tpetra::CrsGraph::insertLocallIndicesImpl"); + const RowInfo rowInfo = this->getRowInfo(myRow); size_t numNewInds = 0; @@ -1837,6 +1845,8 @@ namespace Tpetra { using Kokkos::MemoryUnmanaged; auto invalidCount = Teuchos::OrdinalTraits::invalid(); + Details::ProfilingRegion regionFGI("Tpetra::CrsGraph::findGlobalIndices"); + using inp_view_type = View; inp_view_type inputInds(indices.getRawPtr(), indices.size()); @@ -1847,10 +1857,18 @@ namespace Tpetra { if (this->colMap_.is_null()) return invalidCount; const auto& colMap = *(this->colMap_); + auto map = [&](GO const gblInd){return colMap.getLocalElement(gblInd);}; - numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), - rowInfo.numEntries, - lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + + if (this->isSorted()) { + numFound = Details::findCrsIndicesSorted(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } else { + numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } } else if (this->isGloballyIndexed()) { @@ -1861,7 +1879,6 @@ namespace Tpetra { return numFound; } - template size_t CrsGraph:: @@ -2313,6 +2330,8 @@ namespace Tpetra { using Teuchos::ArrayView; const char tfecfFuncName[] = "getGlobalRowCopy: "; + Details::ProfilingRegion regionGGRC("Tpetra::CrsGraph::getGlobalRowCopy"); + // This does the right thing (reports an empty row) if the input // row is invalid. const RowInfo rowinfo = getRowInfoFromGlobalRowIndex (globalRow); @@ -2324,17 +2343,15 @@ namespace Tpetra { numEntries = theNumEntries; // first side effect if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { + if (isLocallyIndexed ()) { auto lclInds = getLocalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = colMap_->getGlobalElement (lclInds(j)); - } + bool err = colMap_->getGlobalElements(lclInds.data(), theNumEntries, indices.data()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); } else if (isGloballyIndexed ()) { auto gblInds = getGlobalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = gblInds(j); - } + std::memcpy((void*)indices.data(), (const void*) gblInds.data(), theNumEntries*sizeof(*indices.data())); } } } @@ -2912,6 +2929,8 @@ namespace Tpetra { using size_type = typename Teuchos::Array::size_type; const char tfecfFuncName[] = "globalAssemble: "; // for exception macro + Details::ProfilingRegion regionGA("Tpetra::CrsGraph::globalAssemble"); + std::unique_ptr prefix; if (verbose_) { prefix = this->createPrefix("CrsGraph", "globalAssemble"); @@ -3163,6 +3182,8 @@ namespace Tpetra { const char tfecfFuncName[] = "fillComplete: "; const bool verbose = verbose_; + Details::ProfilingRegion regionFC("Tpetra::CrsGraph::fillComplete"); + std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "fillComplete"); @@ -3531,6 +3552,8 @@ namespace Tpetra { "expertStaticFillComplete): "; const size_t lclNumRows = this->getLocalNumRows (); + Details::ProfilingRegion regionFLG("Tpetra::CrsGraph::fillLocalGraph"); + // This method's goal is to fill in the two arrays (compressed // sparse row format) that define the sparse graph's structure. @@ -4802,116 +4825,9 @@ namespace Tpetra { using LO = local_ordinal_type; using GO = global_ordinal_type; using this_CRS_type = CrsGraph; - const char tfecfFuncName[] = "copyAndPermute: "; - const bool verbose = verbose_; - - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("CrsGraph", "copyAndPermute"); - std::ostringstream os; - os << *prefix << endl; - std::cerr << os.str (); - } - - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), - std::runtime_error, "permuteToLIDs.extent(0) = " - << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " - << permuteFromLIDs.extent (0) << "."); - - // We know from checkSizes that the source object is a - // row_graph_type, so we don't need to check again. - const row_graph_type& srcRowGraph = - dynamic_cast (source); - - if (verbose) { - std::ostringstream os; - os << *prefix << "Compute padding" << endl; - std::cerr << os.str (); - } - auto padding = computeCrsPadding(srcRowGraph, numSameIDs, - permuteToLIDs, permuteFromLIDs, verbose); - applyCrsPadding(*padding, verbose); - - // If the source object is actually a CrsGraph, we can use view - // mode instead of copy mode to access the entries in each row, - // if the graph is not fill complete. - const this_CRS_type* srcCrsGraph = - dynamic_cast (&source); - - const map_type& srcRowMap = *(srcRowGraph.getRowMap()); - const map_type& tgtRowMap = *(getRowMap()); - const bool src_filled = srcRowGraph.isFillComplete(); - nonconst_global_inds_host_view_type row_copy; - LO myid = 0; - - // - // "Copy" part of "copy and permute." - // - if (src_filled || srcCrsGraph == nullptr) { - if (verbose) { - std::ostringstream os; - os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; - std::cerr << os.str (); - } - // If the source graph is fill complete, we can't use view mode, - // because the data might be stored in a different format not - // compatible with the expectations of view mode. Also, if the - // source graph is not a CrsGraph, we can't use view mode, - // because RowGraph only provides copy mode access to the data. - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); - this->insertGlobalIndices (gid, row_length, row_copy.data()); - } - } else { - if (verbose) { - std::ostringstream os; - os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; - std::cerr << os.str (); - } - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - global_inds_host_view_type row; - srcCrsGraph->getGlobalRowView (gid, row); - this->insertGlobalIndices (gid, row.extent(0), row.data()); - } - } - - // - // "Permute" part of "copy and permute." - // - auto permuteToLIDs_h = permuteToLIDs.view_host (); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); - - if (src_filled || srcCrsGraph == nullptr) { - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); - this->insertGlobalIndices (mygid, row_length, row_copy.data()); - } - } else { - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - global_inds_host_view_type row; - srcCrsGraph->getGlobalRowView (srcgid, row); - this->insertGlobalIndices (mygid, row.extent(0), row.data()); - } - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Done" << endl; - std::cerr << os.str (); - } + const row_graph_type& srcRowGraph = dynamic_cast (source); + copyAndPermuteImpl(srcRowGraph, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, INSERT); + return; } template @@ -5137,6 +5053,7 @@ namespace Tpetra { std::vector tgtGblColIndsScratch; execute_sync_host_uvm_access(); // protect host UVM access + // FIXME parallel_for for (LO lclRowInd = 0; lclRowInd < numSameIDs; ++lclRowInd) { const GO srcGblRowInd = srcRowMap.getGlobalElement(lclRowInd); const GO tgtGblRowInd = tgtRowMap.getGlobalElement(lclRowInd); @@ -7576,6 +7493,258 @@ namespace Tpetra { return output; } + template + void + CrsGraph:: + insertGlobalIndicesDevice(const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd) + { + using crs_graph_type = CrsGraph; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; + typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); + + const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); + const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); + + using local_map_type = typename crs_graph_type::map_type::local_map_type; + local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); + local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); + local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); + + auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); + auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); + auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); + auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); + + typename crs_graph_type::num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; + + auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); + + const bool sorted = false; + + bool hasMap = permuteFromLIDs.extent(0) > 0; + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + +#ifdef CRSGRAPH_INNER_ABORT +#undef CRSGRAPH_INNER_ABORT +#endif + +#define CRSGRAPH_INNER_ABORT(lin) do { \ + printf("ERROR: Tpetra_CrsGraph_def.hpp:%d", lin); \ + Kokkos::abort("error"); \ + } while(0) + + Kokkos::parallel_for( + "Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, loopEnd), + KOKKOS_LAMBDA(const LO sourceLID) { + auto srcLid = sourceLID; + auto tgtLid = sourceLID; + if (hasMap) { + srcLid = permuteFromLIDs_d(srcLid); + tgtLid = permuteToLIDs_d(tgtLid); + } + auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); + if (srcGid == GINV) + CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); + + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); + if (tgtLocalRow == LINV) + CRSGRAPH_INNER_ABORT(__LINE__); + if (tgtLocalRow != tgtLid) + CRSGRAPH_INNER_ABORT(__LINE__); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + auto start = srcLocalRowPtrsDevice(srcLid); + auto end = srcLocalRowPtrsDevice(srcLid + 1); + auto rowLength = (end - start); + + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); + auto tend = tstart + tgtNumEntries; + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + size_t num_inserted = 0; + + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); + + size_t hint=0; + for (size_t j = 0; j < rowLength; j++) { + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) + CRSGRAPH_INNER_ABORT(__LINE__); + auto numInTgtRow = (tend - tstart); + + const size_t offset = KokkosSparse::findRelOffset( + tgtGlobalColIndsPtr+tstart, numInTgtRow, gi, hint, sorted + ); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + Kokkos::abort("num_avail"); + } + tgtGlobalColIndsPtr[tstart + offset] = gi; + ++tend; + hint = offset + 1; + ++num_inserted; + } + } + k_numRowEnt(tgtLocalRow) += num_inserted; + + return size_t(0); + } + ); + Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); + tgtCrsGraph.setLocallyModified(); + } + + + template + void + CrsGraph:: + copyAndPermuteImpl( + const row_graph_type& srcRowGraph, + row_graph_type& tgtRowGraph, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM + ) { + using std::endl; + using LO = local_ordinal_type; + using GO = global_ordinal_type; + const char tfecfFuncName[] = "copyAndPermuteImpl: "; + const bool verbose = verbose_; + + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteImpl"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("CrsGraph", "copyAndPermuteImpl"); + std::ostringstream os; + os << *prefix << endl; + std::cerr << os.str (); + } + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), + std::runtime_error, "permuteToLIDs.extent(0) = " + << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " + << permuteFromLIDs.extent (0) << "."); + + if (verbose) { + std::ostringstream os; + os << *prefix << "Compute padding" << endl; + std::cerr << os.str (); + } + + using crs_graph_type = CrsGraph; + const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + !srcCrsGraphPtr, std::runtime_error, "error srcGraph type= " << typeid(srcRowGraph).name() + ); + const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; + + crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + !srcCrsGraphPtr, std::runtime_error, "error tgtGraph type= " << typeid(tgtRowGraph).name() + ); + + crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; + + auto padding = tgtCrsGraph.computeCrsPadding( + srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose + ); + + tgtCrsGraph.applyCrsPadding(*padding, verbose); + + const map_type& srcRowMap = *(srcRowGraph.getRowMap()); + const map_type& tgtRowMap = *(tgtRowGraph.getRowMap()); + const bool src_filled = srcRowGraph.isFillComplete(); + nonconst_global_inds_host_view_type row_copy; + LO myid = 0; + + // + // "Copy" part of "copy and permute." + // + LO numSameIDs_as_LID = static_cast(numSameIDs); + + if (src_filled || srcCrsGraphPtr == nullptr) { + if (verbose) { + std::ostringstream os; + os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; + std::cerr << os.str (); + } + // If the source graph is fill complete, we can't use view mode, + // because the data might be stored in a different format not + // compatible with the expectations of view mode. Also, if the + // source graph is not a CrsGraph, we can't use view mode, + // because RowGraph only provides copy mode access to the data. + Kokkos::DualView noPermute; + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, noPermute, noPermute, numSameIDs_as_LID); + } else { + if (verbose) { + std::ostringstream os; + os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; + std::cerr << os.str (); + } + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (gid, row); + tgtCrsGraph.insertGlobalIndices (gid, row.extent(0), row.data()); + } + } + + // + // "Permute" part of "copy and permute." + // + auto permuteToLIDs_h = permuteToLIDs.view_host (); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + + if (src_filled || srcCrsGraphPtr == nullptr) { + // note reversed arg order, tgt, then src + insertGlobalIndicesDevice( + srcCrsGraph, + tgtCrsGraph, + permuteToLIDs, + permuteFromLIDs, + static_cast (permuteToLIDs_h.extent (0)) + ); + } else { + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (srcgid, row); + tgtCrsGraph.insertGlobalIndices (mygid, row.extent(0), row.data()); + } + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index f0eef6b3b32e..7f88a841060b 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -49,6 +49,7 @@ #include "KokkosSparse_spmv.hpp" #include +#include #include #include #include @@ -2423,15 +2424,105 @@ namespace Tpetra { const impl_scalar_type newVals[], const LocalOrdinal numElts) { - Teuchos::ArrayView indsT(inds, numElts); - auto fun = - [&](size_t const k, size_t const /*start*/, size_t const offset) { - rowVals[offset] = newVals[k]; - }; - std::function cb(std::ref(fun)); - return graph.findGlobalIndices(rowInfo, indsT, cb); + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + [[maybe_unused]] LocalOrdinal niv=0; + + { // new + typedef LocalOrdinal LO; + typedef GlobalOrdinal GO; + + const bool sorted = graph.isSorted (); + const bool atomic = useAtomicUpdatesByDefault; // FIXME + size_t hint = 0; // guess at the index's relative offset in the row + LO numValid = 0; // number of valid input column indices + + if (graph.isLocallyIndexed ()) { + // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its + // pointer does NOT change its reference count. Thus, this + // code is still thread safe. + if (graph.colMap_.is_null ()) { + // NO input column indices are valid in this case, since if + // the column Map is null on the calling process, then the + // calling process owns no graph entries. + return numValid; + } + const map_type& colMap = * (graph.colMap_); + + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getLocalIndsViewHost (rowInfo); + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset]= newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + } + else if (graph.isGloballyIndexed ()) { + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getGlobalIndsViewHost (rowInfo); + + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset] = newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + // If the graph is neither locally nor globally indexed on the + // calling process, that means the calling process has no graph + // entries. Thus, none of the input column indices are valid. + return numValid; + } + return LINV; } + template LocalOrdinal CrsMatrix:: @@ -2466,8 +2557,8 @@ namespace Tpetra { return Teuchos::OrdinalTraits::invalid (); } const crs_graph_type& graph = * (this->staticGraph_); - const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow); + if (rowInfo.localRow == Teuchos::OrdinalTraits::invalid ()) { // The input local row is invalid on the calling process, // which means that the calling process summed 0 entries. @@ -2475,6 +2566,7 @@ namespace Tpetra { } auto curRowVals = this->getValuesViewHostNonConst (rowInfo); + const IST* const inVals = reinterpret_cast (inputVals); return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo, inputGblColInds, inVals, numEnt); @@ -3235,11 +3327,10 @@ CrsMatrix:: const map_type& colMap = * (staticGraph_->colMap_); auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo); auto curVals = getValuesViewHost(rowinfo); - - for (size_t j = 0; j < theNumEntries; ++j) { - values[j] = curVals[j]; - indices[j] = colMap.getGlobalElement (curLclInds(j)); - } + bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, indices.data()); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(err, std::runtime_error, "getGlobalElements error"); + // FIXME - this should/could be a kokkos deep copy? + std::memcpy((void*)values.data(), (const void*) curVals.data(), numEntries*sizeof(*values.data())); } else if (staticGraph_->isGloballyIndexed ()) { auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo); @@ -3469,7 +3560,7 @@ CrsMatrix:: setAllValues ( const local_matrix_device_type& localDeviceMatrix) { using ProfilingRegion=Details::ProfilingRegion; - ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix"); + ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues1 from KokkosSparse::CrsMatrix"); auto graph = localDeviceMatrix.graph; //FIXME how to check whether graph is allocated @@ -3495,7 +3586,7 @@ CrsMatrix:: typedef impl_scalar_type IST; typedef typename local_graph_device_type::row_map_type row_map_type; //typedef typename row_map_type::non_const_value_type row_offset_type; - const char tfecfFuncName[] = "setAllValues(ArrayRCP, ArrayRCP, ArrayRCP): "; + const char tfecfFuncName[] = "setAllValues2(ArrayRCP, ArrayRCP, ArrayRCP): "; // The row offset type may depend on the execution space. It may // not necessarily be size_t. If it's not, we need to make a deep @@ -5654,6 +5745,16 @@ CrsMatrix:: myGraph_->setRowPtrsUnpacked(row_ptr_beg); } + template + void copyAndPermuteStaticGraphImpl( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes + ); + template void CrsMatrix:: @@ -5663,6 +5764,22 @@ CrsMatrix:: const LocalOrdinal permuteToLIDs[], const LocalOrdinal permuteFromLIDs[], const size_t numPermutes) + { + copyAndPermuteStaticGraphImpl( + srcMat, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, numPermutes + ); + return; + } + + template + void + CrsMatrix:: + copyAndPermuteNonStaticGraph( + const RowMatrix& srcMat, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs_dv, + const Kokkos::DualView& permuteFromLIDs_dv, + const size_t numPermutes) { using Details::ProfilingRegion; using Teuchos::Array; @@ -5670,11 +5787,11 @@ CrsMatrix:: using std::endl; using LO = LocalOrdinal; using GO = GlobalOrdinal; - const char tfecfFuncName[] = "copyAndPermuteStaticGraph"; + const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph"; const char suffix[] = " Please report this bug to the Tpetra developers."; ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph"); + ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph"); const bool debug = Details::Behavior::debug("CrsGraph"); const bool verbose = Details::Behavior::verbose("CrsGraph"); @@ -5687,15 +5804,25 @@ CrsMatrix:: const char* const prefix_raw = verbose ? prefix.get()->c_str() : nullptr; + { + using row_graph_type = RowGraph; + const row_graph_type& srcGraph = *(srcMat.getGraph()); + auto padding = + myGraph_->computeCrsPadding(srcGraph, numSameIDs, + permuteToLIDs_dv, permuteFromLIDs_dv, verbose); + applyCrsPadding(*padding, verbose); + } const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // const map_type& srcRowMap = * (srcMat.getRowMap ()); - nonconst_global_inds_host_view_type rowInds; - nonconst_values_host_view_type rowVals; const LO numSameIDs_as_LID = static_cast (numSameIDs); + using gids_type = nonconst_global_inds_host_view_type; + using vals_type = nonconst_values_host_view_type; + gids_type rowInds; + vals_type rowVals; for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { // Global ID for the current row index in the source matrix. // The first numSameIDs GIDs in the two input lists are the @@ -5703,52 +5830,43 @@ CrsMatrix:: const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); const GO targetGID = sourceGID; - ArrayViewrowIndsConstView; + ArrayView rowIndsConstView; ArrayView rowValsConstView; if (sourceIsLocallyIndexed) { + const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.size())) { + if (rowLength > static_cast (rowInds.extent(0))) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because // they have to be converted from LIDs). size_t checkRowLength = 0; - srcMat.getGlobalRowCopy (sourceGID, rowIndsView, - rowValsView, checkRowLength); + srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, + checkRowLength); if (debug) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, "For " + (rowLength != checkRowLength, std::logic_error, ": For " "global row index " << sourceGID << ", the source " "matrix's getNumEntriesInGlobalRow returns a row length " "of " << rowLength << ", but getGlobalRowCopy reports " "a row length of " << checkRowLength << "." << suffix); } - - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface + rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); + rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); } else { // source matrix is globally indexed. global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews @@ -5761,20 +5879,19 @@ CrsMatrix:: Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface - } - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); + // Combine the data into the target matrix. + insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, + rowValsConstView, prefix_raw, debug, verbose); } if (verbose) { std::ostringstream os; os << *prefix << "Do permutes" << endl; } + const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data(); + const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data(); const map_type& tgtRowMap = * (this->getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { @@ -5786,14 +5903,14 @@ CrsMatrix:: if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.size ())) { + if (rowLength > static_cast (rowInds.extent(0))) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because @@ -5809,24 +5926,14 @@ CrsMatrix:: rowLength << ", but getGlobalRowCopy a row length of " << checkRowLength << "." << suffix); } - - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface + rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); + rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); } else { global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews @@ -5841,9 +5948,9 @@ CrsMatrix:: // KDDKDD UVM TEMPORARY: KokkosView interface } - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); + // Combine the data into the target matrix. + insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, + rowValsConstView, prefix_raw, debug, verbose); } if (verbose) { @@ -5853,145 +5960,209 @@ CrsMatrix:: } template - void - CrsMatrix:: - copyAndPermuteNonStaticGraph( + void copyAndPermuteStaticGraphImpl( const RowMatrix& srcMat, + RowMatrix& tgtMat, const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs_dv, - const Kokkos::DualView& permuteFromLIDs_dv, - const size_t numPermutes) - { + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes + ) { using Details::ProfilingRegion; using Teuchos::Array; - using Teuchos::ArrayView; + //using Teuchos::ArrayView; using std::endl; using LO = LocalOrdinal; using GO = GlobalOrdinal; - const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph"; - const char suffix[] = - " Please report this bug to the Tpetra developers."; + + using impl_scalar_type = typename Kokkos::ArithTraits::val_type; + + using crs_matrix_type = CrsMatrix; + + typedef typename crs_matrix_type::local_inds_device_view_type::non_const_value_type local_inds_device_value_t; + typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; + + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + const char tfecfFuncName[] = "copyAndPermuteStaticGraphImpl"; ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph"); + ("Tpetra::CrsMatrix::copyAndPermuteStaticGraphImpl"); - const bool debug = Details::Behavior::debug("CrsGraph"); - const bool verbose = Details::Behavior::verbose("CrsGraph"); - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("CrsGraph", tfecfFuncName); - std::ostringstream os; - os << *prefix << "Start" << endl; - } - const char* const prefix_raw = - verbose ? prefix.get()->c_str() : nullptr; + // const bool debug = Details::Behavior::debug("CrsGraph"); + // const bool verbose = Details::Behavior::verbose("CrsGraph"); + + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + TEUCHOS_TEST_FOR_EXCEPTION(srcMatCrsPtr == nullptr, std::runtime_error, "bad srcMatCrsPtr"); + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); + TEUCHOS_TEST_FOR_EXCEPTION(tgtMatCrsPtr == nullptr, std::runtime_error, "bad tgtMatCrsPtr"); + crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; + + std::string prefix = tfecfFuncName; + // const char* const prefix_raw = prefix.c_str(); - { - using row_graph_type = RowGraph; - const row_graph_type& srcGraph = *(srcMat.getGraph()); - auto padding = - myGraph_->computeCrsPadding(srcGraph, numSameIDs, - permuteToLIDs_dv, permuteFromLIDs_dv, verbose); - applyCrsPadding(*padding, verbose); - } const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // - const map_type& srcRowMap = * (srcMat.getRowMap ()); - const LO numSameIDs_as_LID = static_cast (numSameIDs); - using gids_type = nonconst_global_inds_host_view_type; - using vals_type = nonconst_values_host_view_type; - gids_type rowInds; - vals_type rowVals; - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; + const auto& srcRowMap = * (srcMat.getRowMap ()); + auto comm = srcRowMap.getComm(); - ArrayView rowIndsConstView; - ArrayView rowValsConstView; + const LO numSameIDs_as_LID = static_cast (numSameIDs); - if (sourceIsLocallyIndexed) { + auto my_replaceGlobalValuesImpl_scalar + = KOKKOS_LAMBDA( + const bool sorted, const bool atomic, size_t hint[], + const size_t numInTgtRow, const local_inds_device_value_t tgtColInds[], impl_scalar_type tgtRowVals[], + const local_inds_device_value_t lclColInd, const impl_scalar_type newVals + ) -> LO + { + LO numValid = 0; // number of valid input column indices + + if (atomic) { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + Kokkos::atomic_store (&tgtRowVals[offset], newVals); + hint[0] = offset + 1; + numValid++; + } + } + } else { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + tgtRowVals[offset] = newVals; + hint[0] = offset + 1; + numValid++; + } + } + } + return numValid; + }; + + if (sourceIsLocallyIndexed) { + + const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); + + typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); + typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); + typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); + + bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); + + using local_map_type = typename crs_matrix_type::map_type::local_map_type; + + local_map_type local_map = srcMat.getRowMap()->getLocalMap(); + local_map_type local_col_map = srcMat.getColMap()->getLocalMap(); + local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); + local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); + + auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); + auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); - const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.extent(0))) { - Kokkos::resize(rowInds,rowLength); - Kokkos::resize(rowVals,rowLength); - } - // Resizing invalidates an Array's views, so we must make new - // ones, even if rowLength hasn't changed. - gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraphImpl", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + local_inds_device_value_t start = srcLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); + local_inds_device_value_t rowLength = (end - start); + + local_inds_device_value_t tstart = tgtLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); + local_inds_device_value_t numInTgtRow = (tend - tstart); + + KOKKOS_ASSERT(static_cast(tstart) < tvals.extent(0)); + impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); + const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); + + size_t hint=0; + for (LO j = 0; j < rowLength; j++) { + local_inds_device_value_t ci = srcLocalColIndsDevice(start + j); + GO gi = local_col_map.getGlobalElement(ci); + const local_inds_device_value_t lclColInd = tgt_local_col_map.getLocalElement(gi); + my_replaceGlobalValuesImpl_scalar(tgtMatIsSorted, false, &hint, + numInTgtRow, tgtColInds, tgtRowVals, + lclColInd, vals(start+j)); + } + + }); // kokkos parallel_for - // The source matrix is locally indexed, so we have to get a - // copy. Really it's the GIDs that have to be copied (because - // they have to be converted from LIDs). - size_t checkRowLength = 0; - srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, - checkRowLength); - if (debug) { - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, ": For " - "global row index " << sourceGID << ", the source " - "matrix's getNumEntriesInGlobalRow returns a row length " - "of " << rowLength << ", but getGlobalRowCopy reports " - "a row length of " << checkRowLength << "." << suffix); - } - rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); - rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); - } - else { // source matrix is globally indexed. - global_inds_host_view_type rowIndsView; - values_host_view_type rowValsView; + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface - } - // Combine the data into the target matrix. - insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, - rowValsConstView, prefix_raw, debug, verbose); + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + // FIXME - need to apply the same approach as above, maybe reuse my_replaceGlobalValuesImpl_scalar? + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } } - if (verbose) { - std::ostringstream os; - os << *prefix << "Do permutes" << endl; - } - const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data(); - const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data(); + // FIXME - need to apply the same approach as above to the permutes - const map_type& tgtRowMap = * (this->getRowMap ()); + // + // "Permute" part of "copy and permute." + // + typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; + typename crs_matrix_type::nonconst_values_host_view_type rowVals; + + const auto& tgtRowMap = * (tgtMat.getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); - ArrayView rowIndsConstView; - ArrayView rowValsConstView; + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.extent(0))) { + if (rowLength > static_cast (rowInds.size ())) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because @@ -5999,45 +6170,42 @@ CrsMatrix:: size_t checkRowLength = 0; srcMat.getGlobalRowCopy(sourceGID, rowIndsView, rowValsView, checkRowLength); - if (debug) { - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (rowLength != checkRowLength, std::logic_error, "For " - "source matrix global row index " << sourceGID << ", " - "getNumEntriesInGlobalRow returns a row length of " << - rowLength << ", but getGlobalRowCopy a row length of " - << checkRowLength << "." << suffix); - } - rowIndsConstView = Teuchos::ArrayView(rowIndsView.data(), rowLength); - rowValsConstView = Teuchos::ArrayView(reinterpret_cast(rowValsView.data()), rowLength); + + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface } else { - global_inds_host_view_type rowIndsView; - values_host_view_type rowValsView; + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface } - // Combine the data into the target matrix. - insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView, - rowValsConstView, prefix_raw, debug, verbose); + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); } - if (verbose) { - std::ostringstream os; - os << *prefix << "Done" << endl; - } } template @@ -6092,15 +6260,13 @@ CrsMatrix:: using RMT = RowMatrix; const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { - TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); - auto permuteToLIDs_h = permuteToLIDs.view_host (); - TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); - - copyAndPermuteStaticGraph(srcMat, numSameIDs, - permuteToLIDs_h.data(), - permuteFromLIDs_h.data(), - numPermute); + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + copyAndPermuteStaticGraph( + srcMat, numSameIDs, permuteToLIDs_d.data(), permuteFromLIDs_d.data(), numPermute + ); } else { copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs, @@ -6819,17 +6985,17 @@ CrsMatrix:: const bool verbose) { const char tfecfFuncName[] = "combineGlobalValues: "; - - if (isStaticGraph ()) { + const bool isg = isStaticGraph (); + if (isg) { // INSERT doesn't make sense for a static graph, since you // aren't allowed to change the structure of the graph. // However, all the other combine modes work. - if (combineMode == ADD) { - sumIntoGlobalValues (globalRowIndex, columnIndices, values); - } - else if (combineMode == REPLACE) { + if (combineMode == REPLACE) { replaceGlobalValues (globalRowIndex, columnIndices, values); } + else if (combineMode == ADD) { + sumIntoGlobalValues (globalRowIndex, columnIndices, values); + } else if (combineMode == ABSMAX) { using ::Tpetra::Details::AbsMax; AbsMax f; @@ -7053,6 +7219,7 @@ CrsMatrix:: } if (isStaticGraph ()) { + using Details::unpackCrsMatrixAndCombineNew; unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID, importLIDs, constantNumPackets, @@ -8842,6 +9009,7 @@ CrsMatrix:: #ifdef HAVE_TPETRA_MMM_TIMINGS Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries"))); #endif + Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d); diff --git a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp index cc17543ec7c1..24072bc0996b 100644 --- a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp @@ -624,7 +624,15 @@ class WrappedDualView { // We check to see if the memory is not aliased *or* if it is a supported // (heterogeneous memory) accelerator (for shared host/device memory). - return !memoryIsAliased() || Spaces::is_gpu_exec_space(); + if constexpr(Spaces::is_gpu_exec_space()) { + return true; + } else { + if constexpr(!deviceMemoryIsHostAccessible) { + return true; + } else { + return dualView.h_view.data() != dualView.d_view.data(); + } + } } diff --git a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp index 4496f17eb12a..05d484574224 100644 --- a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp @@ -20,6 +20,7 @@ #include #include #include +#include /// \file Tpetra_Details_crsUtils.hpp /// \brief Functions for manipulating CRS arrays @@ -422,8 +423,8 @@ insert_crs_indices( if (idx == cur_indices[row_offset]) { break; } - } - + } + if (row_offset == end) { if (num_inserted >= num_avail) { // not enough room return Teuchos::OrdinalTraits::invalid(); @@ -499,22 +500,67 @@ find_crs_indices( size_t num_found = 0; for (size_t k = 0; k < new_indices.size(); k++) { - auto row_offset = start; auto idx = std::forward(map)(new_indices[k]); if (idx == invalid_ordinal) continue; - for (; row_offset < end; row_offset++) + for (size_t row_offset = start; row_offset < end; row_offset++) { - if (idx == cur_indices[row_offset]) + size_t off = row_offset - start; + auto lidx = cur_indices[row_offset]; + if (idx == lidx) { - std::forward(cb)(k, start, row_offset - start); + std::forward(cb)(k, start, off); num_found++; + // FIXME why no break here, can an index be found twice? + // break; } } } return num_found; } +/// \brief Implementation of findCrsIndices +template +size_t +find_crs_indices_sorted( + typename Pointers::value_type const row, + Pointers const& row_ptrs, + const size_t curNumEntries, + Indices1 const& cur_indices, + Indices2 const& new_indices, + IndexMap&& map, + Callback&& cb) +{ + if (new_indices.size() == 0) + return 0; + + using ordinal = + typename std::remove_const::type; + auto invalid_ordinal = Teuchos::OrdinalTraits::invalid(); + + const size_t start = static_cast (row_ptrs[row]); + const size_t end = start + curNumEntries; + size_t num_found = 0; + for (size_t k = 0; k < new_indices.size(); k++) + { + auto idx = std::forward(map)(new_indices[k]); + if (idx == invalid_ordinal) + continue; + + // FIXME use kokkos findRelOffset + auto first = &cur_indices[start]; + auto first0 = first; + auto last = &cur_indices[end]; + first = std::lower_bound(first, last, idx); + size_t off = first - first0; + if (first != last && !(idx < *first)) { + std::forward(cb)(k, start, off); + num_found++; + } + } + return num_found; +} + } // namespace impl @@ -718,6 +764,20 @@ findCrsIndices( return impl::find_crs_indices(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); } +template +size_t +findCrsIndicesSorted( + typename Pointers::value_type const row, + Pointers const& rowPtrs, + const size_t curNumEntries, + Indices1 const& curIndices, + Indices2 const& newIndices, + IndexMap&& map, + Callback&& cb) +{ + return impl::find_crs_indices_sorted(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); +} + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Map_decl.hpp b/packages/tpetra/core/src/Tpetra_Map_decl.hpp index 2ea5ee6f343e..4b406405e9f9 100644 --- a/packages/tpetra/core/src/Tpetra_Map_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_decl.hpp @@ -642,6 +642,7 @@ namespace Tpetra { /// the same value as /// Teuchos::OrdinalTraits::invalid(). global_ordinal_type getGlobalElement (local_ordinal_type localIndex) const; + bool getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const; /// \brief Get the LocalMap for Kokkos-Kernels. /// diff --git a/packages/tpetra/core/src/Tpetra_Map_def.hpp b/packages/tpetra/core/src/Tpetra_Map_def.hpp index c6b028e8f616..f697ab3aa8bb 100644 --- a/packages/tpetra/core/src/Tpetra_Map_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_def.hpp @@ -1066,7 +1066,7 @@ namespace Tpetra { // beginning of the range starts with the first entry. While // doing so, fill in the LID -> GID table. typename decltype (lgMap_)::non_const_type lgMap - (view_alloc ("lgMap", WithoutInitializing), numLocalElements_); + (view_alloc ("lgMap2", WithoutInitializing), numLocalElements_); // Because you can't use lambdas in constructors on CUDA. Or using private/protected data. // DEEP_COPY REVIEW - DEVICE-TO-DEVICE @@ -1274,6 +1274,40 @@ namespace Tpetra { } } + template + bool + Map:: + getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const + { + auto const minGI = getMinGlobalIndex(); + auto const minLI = getMinLocalIndex(); + auto const maxLI = getMaxLocalIndex(); + if (isContiguous ()) { + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = minGI + lclInd; + } + } + else { + // This is a host Kokkos::View access, with no RCP or ArrayRCP + // involvement. As a result, it is thread safe. + // + // lgMapHost_ is a host pointer; this does NOT assume UVM. + lazyPushToHost(); + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = lgMapHost_[lclInd]; + } + } + return false; + } + template bool Map:: @@ -1662,7 +1696,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap3", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl; @@ -1749,7 +1783,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap4", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl;