From c09cc01f90c1761bd2e0968bb0540c4da051548f Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 9 May 2023 17:56:12 +0200 Subject: [PATCH 01/30] #2074: Add test for sparse OfflineLB maps --- tests/unit/lb/test_offlinelb.cc | 87 +++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 981e09d801..eaf1bf6750 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -74,6 +74,10 @@ struct SimCol : vt::Collection { EXPECT_EQ(getIndex().x() / 2, next_node); } } + + void sparseHandler(Msg* m){ + // TODO + } }; TEST_F(TestOfflineLB, test_offlinelb_1) { @@ -149,6 +153,89 @@ TEST_F(TestOfflineLB, test_offlinelb_1) { } } +TEST_F(TestOfflineLB, test_offlinelb_2) { + using LBDataHolder = vt::vrt::collection::balance::LBDataHolder; + using ElementIDStruct = vt::vrt::collection::balance::ElementIDStruct; + using LoadSummary = vt::vrt::collection::balance::LoadSummary; + using LBDataRestartReader = vt::vrt::collection::balance::LBDataRestartReader; + + auto const this_node = theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + auto const next_node = (this_node + 1) % num_nodes; + auto const prev_node = this_node - 1 >= 0 ? 
this_node - 1 : num_nodes - 1; + + std::unordered_map> ids; + int len = 2; + PhaseType num_phases = 7; + for (int i = 0; i < len; i++) { + auto id = elm::ElmIDBits::createCollectionImpl(true, i+1, this_node, this_node); + id.curr_node = this_node; + ids[0].push_back(id); + id.curr_node = next_node; + ids[3].push_back(id); + id.curr_node = prev_node; + ids[6].push_back(id); + } + + for (int i = 0; i < len; i++) { + auto pid = elm::ElmIDBits::createCollectionImpl(true, i+1, prev_node, this_node); + auto nid = elm::ElmIDBits::createCollectionImpl(true, i+1, next_node, this_node); + ids[1].push_back(pid); + ids[2].push_back(pid); + ids[4].push_back(nid); + ids[5].push_back(nid); + } + + LBDataHolder dh; + for (PhaseType i = 0; i < num_phases; i++) { + for (auto&& elm : ids[i]) { + dh.node_data_[i][elm] = LoadSummary{3}; + } + } + + using JSONAppender = util::json::Appender; + std::stringstream stream{std::ios_base::out | std::ios_base::in}; + nlohmann::json metadata, phasesMetadata; + phasesMetadata["count"] = num_phases; + phasesMetadata["skipped"]["list"] = {2}; + phasesMetadata["skipped"]["range"] = {{2,3}}; + phasesMetadata["identical_to_previous"]["list"] = {1}; + phasesMetadata["identical_to_previous"]["range"] = {{5,6}}; + metadata["type"] = "LBDatafile"; + metadata["phases"] = phasesMetadata; + + auto w = std::make_unique( + "phases", metadata, std::move(stream), true + ); + for (PhaseType i = 0; i < num_phases; i++) { + // ignore skipped and identical phases + if(i != 1 && i != 2 && i != 3 && i != 5 && i != 6) { + auto j = dh.toJson(i); + w->addElm(*j); + } + } + stream = w->finish(); + + theConfig()->vt_lb = true; + theConfig()->vt_lb_name = "OfflineLB"; + auto up = LBDataRestartReader::construct(); + curRT->theLBDataReader = up.get(); + theLBDataReader()->readLBDataFromStream(std::move(stream)); + + vt::Index1D range{2*num_nodes}; + auto proxy = vt::makeCollection("simcol") + .bounds(range) + .bulkInsert() + .wait(); + + for (PhaseType i = 0; i < num_phases; 
i++) { + runInEpochCollective("run sparseHandler", [&]{ + proxy.broadcastCollective(i); + }); + thePhase()->nextPhaseCollective(); + } +} + #endif }}}} /* end namespace vt::tests::unit::lb */ From 178111a5a159f6f9eb3e5f512cdb2d3250016fdf Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 9 May 2023 17:56:57 +0200 Subject: [PATCH 02/30] #2074: Allow for sparse maps in LBDataRestartReader --- .../balance/lb_data_restart_reader.cc | 47 +++++++++++++------ .../balance/lb_data_restart_reader.h | 12 +++++ 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index 91c763a30f..323e7c7677 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -79,19 +79,28 @@ void LBDataRestartReader::startup() { } void LBDataRestartReader::readHistory(LBDataHolder const& lbdh) { - num_phases_ = lbdh.node_data_.size(); + PhaseType last_found_phase = 0; + num_phases_ = lbdh.count_; for (PhaseType phase = 0; phase < num_phases_; phase++) { auto iter = lbdh.node_data_.find(phase); if (iter != lbdh.node_data_.end()) { + last_found_phase = phase; for (auto const& obj : iter->second) { if (obj.first.isMigratable()) { history_[phase].insert(obj.first); } } - } else { - // We assume that all phases are dense all fully specified even if they - // don't change - vtAbort("Could not find data: phases must all be specified"); + } else if(lbdh.identical_phases_.find(phase) != lbdh.identical_phases_.end()) { + // Phase is identical to previous one, fill with data from previous phase + auto last_iter = lbdh.node_data_.find(last_found_phase); + for (auto const& obj : last_iter->second) { + if (obj.first.isMigratable()) { + history_[phase].insert(obj.first); + } + } + } else if(lbdh.skipped_phases_.find(phase) == lbdh.skipped_phases_.end()) { + // Phases which are not present must be specified 
in metadata of the file + vtAbort("Could not find data: Unspecified phases needs to be present in skipped section of the file metadata"); } } } @@ -155,32 +164,40 @@ void LBDataRestartReader::determinePhasesToMigrate() { local_changed_distro.resize(num_phases_ - 1); auto const this_node = theContext()->getNode(); - runInEpochCollective("LBDataRestartReader::updateLocations", [&]{ - for (PhaseType i = 0; i < num_phases_ - 1; ++i) { - local_changed_distro[i] = history_[i] != history_[i+1]; - if (local_changed_distro[i]) { + PhaseType curr = 0, next; + for (;curr < num_phases_ - 1;) { + // find number of next Phase + for(next = curr + 1; next < num_phases_; ++next) { + if(history_.find(next) != history_.end()) { + break; + } + } + + local_changed_distro[curr] = history_[curr] != history_[next]; + if (local_changed_distro[curr]) { std::set departing, arriving; std::set_difference( - history_[i+1].begin(), history_[i+1].end(), - history_[i].begin(), history_[i].end(), + history_[next].begin(), history_[next].end(), + history_[curr].begin(), history_[curr].end(), std::inserter(arriving, arriving.begin()) ); std::set_difference( - history_[i].begin(), history_[i].end(), - history_[i+1].begin(), history_[i+1].end(), + history_[curr].begin(), history_[curr].end(), + history_[next].begin(), history_[next].end(), std::inserter(departing, departing.begin()) ); for (auto&& d : departing) { - proxy_[d.getHomeNode()].send(this_node, i+1, d); + proxy_[d.getHomeNode()].send(this_node, next, d); } for (auto&& a : arriving) { - proxy_[a.getHomeNode()].send(this_node, i+1, a); + proxy_[a.getHomeNode()].send(this_node, next, a); } } + curr = next; } }); diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 0b90e014a1..d83e7f5fa2 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -131,6 +131,12 @@ struct LBDataRestartReader : 
runtime::component::Component * \return element assigned to this node */ std::set const& getDistro(PhaseType phase) { + for(; phase < num_phases_; ++phase) { + if(history_.find(phase) != history_.end()) { + break; + } + } + auto iter = history_.find(phase); vtAssert(iter != history_.end(), "Must have a valid phase"); return iter->second; @@ -142,6 +148,12 @@ struct LBDataRestartReader : runtime::component::Component * \param[in] phase the phase to clear */ void clearDistro(PhaseType phase) { + for(; phase < num_phases_; ++phase) { + if(history_.find(phase) != history_.end()) { + break; + } + } + auto iter = history_.find(phase); if (iter != history_.end()) { history_.erase(iter); From 3ea91c83df211f9c652ec616b591d8eded8b5e58 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 16 May 2023 15:49:16 +0200 Subject: [PATCH 03/30] #2074: Update finding next phase for OfflineLB --- .../balance/lb_data_restart_reader.h | 30 +++++++++++-------- .../collection/balance/offlinelb/offlinelb.cc | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index d83e7f5fa2..2c2a84419a 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -123,6 +123,24 @@ struct LBDataRestartReader : runtime::component::Component | num_phases_; } + /** + * \brief Find the next phase + * + * \param phase the current phase + * + * \return the next phase + */ + PhaseType findNextPhase(PhaseType phase) const { + auto next = phase + 1; + for(; next < num_phases_; ++next) { + if(history_.find(next) != history_.end()) { + return next; + } + } + vtAssert(history_.find(next) != history_.end(), "Must have a valid phase"); + return next; + } + /** * \brief Get the elements assigned for a given phase * @@ -131,12 +149,6 @@ struct LBDataRestartReader : runtime::component::Component * \return element 
assigned to this node */ std::set const& getDistro(PhaseType phase) { - for(; phase < num_phases_; ++phase) { - if(history_.find(phase) != history_.end()) { - break; - } - } - auto iter = history_.find(phase); vtAssert(iter != history_.end(), "Must have a valid phase"); return iter->second; @@ -148,12 +160,6 @@ struct LBDataRestartReader : runtime::component::Component * \param[in] phase the phase to clear */ void clearDistro(PhaseType phase) { - for(; phase < num_phases_; ++phase) { - if(history_.find(phase) != history_.end()) { - break; - } - } - auto iter = history_.find(phase); if (iter != history_.end()) { history_.erase(iter); diff --git a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc index 0ed7636aab..d81313a30a 100644 --- a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc +++ b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc @@ -59,7 +59,7 @@ void OfflineLB::runLB(LoadType) { for (auto&& elm : distro) { migrateObjectTo(elm, theContext()->getNode()); } - theLBDataReader()->clearDistro(phase_ + 1); + theLBDataReader()->clearDistro(nextPhase); } }}}} /* end namespace vt::vrt::collection::lb */ From 0e61a0e7af9d003da9fdbd5d7f87b0262935595f Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 16 May 2023 18:30:23 +0200 Subject: [PATCH 04/30] #2074: Create LBDataRestartReader when there is OfflineLB in the config file --- src/vt/runtime/runtime.cc | 13 +++++++++++-- src/vt/vrt/collection/balance/read_lb.cc | 9 +++++++++ src/vt/vrt/collection/balance/read_lb.h | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index f8ad7f36d1..d62b1f9a07 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -424,9 +424,18 @@ bool Runtime::tryFinalize(bool const disable_sig) { } bool Runtime::needLBDataRestartReader() { + using vrt::collection::balance::ReadLBConfig; + #if vt_check_enabled(lblite) - if (true) 
{ - return arg_config_->config_.vt_lb_data_in; + if (arg_config_->config_.vt_lb_data_in) { + auto& config_file = arg_config_->config_.vt_lb_file_name; + if (config_file != "") { + bool const has_spec = ReadLBConfig::openConfig(config_file); + if (has_spec) { + return ReadLBConfig::hasOfflineLB(); + } + } + return false; } else #endif return false; diff --git a/src/vt/vrt/collection/balance/read_lb.cc b/src/vt/vrt/collection/balance/read_lb.cc index f2aa22d20d..a32e99e45b 100644 --- a/src/vt/vrt/collection/balance/read_lb.cc +++ b/src/vt/vrt/collection/balance/read_lb.cc @@ -102,6 +102,15 @@ namespace vt { namespace vrt { namespace collection { namespace balance { } } +/*static*/ bool ReadLBConfig::hasOfflineLB() { + for(auto&& ele : config_exact_) { + if(getLB(ele.first) == LBType::OfflineLB) { + return true; + } + } + return false; +} + /*static*/ ConfigEntry* ReadLBConfig::entry(ConfigIndex const& idx) { // First, search the exact iter config for this iteration: it has the highest // precedence diff --git a/src/vt/vrt/collection/balance/read_lb.h b/src/vt/vrt/collection/balance/read_lb.h index a1fbb6998a..60f4c88dd0 100644 --- a/src/vt/vrt/collection/balance/read_lb.h +++ b/src/vt/vrt/collection/balance/read_lb.h @@ -197,6 +197,7 @@ struct ReadLBConfig { static ConfigIndex numEntries() { return config_mod_.size() + config_exact_.size(); } static ConfigEntry* entry(ConfigIndex const& idx); static LBType getLB(ConfigIndex const& idx); + static bool hasOfflineLB(); static ConfigMapType getModEntries() { return config_mod_; }; static ConfigMapType getExactEntries() {return config_exact_; }; static ParamMapType parseParams(std::vector params); From fa83e75e2906d8d3f20703924cf3928d337c3c0d Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 22 May 2023 15:27:12 +0200 Subject: [PATCH 05/30] #2074: Avoid adding data for identical phases to history --- .../balance/lb_data_restart_reader.cc | 42 ++++++++---------- .../balance/lb_data_restart_reader.h | 43 
+++++++++++++------ 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index 323e7c7677..236727e97f 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -87,20 +87,17 @@ void LBDataRestartReader::readHistory(LBDataHolder const& lbdh) { last_found_phase = phase; for (auto const& obj : iter->second) { if (obj.first.isMigratable()) { - history_[phase].insert(obj.first); + if (history_[phase] == nullptr) { + history_[phase] = std::make_shared>(); + } + history_[phase]->insert(obj.first); } } } else if(lbdh.identical_phases_.find(phase) != lbdh.identical_phases_.end()) { - // Phase is identical to previous one, fill with data from previous phase - auto last_iter = lbdh.node_data_.find(last_found_phase); - for (auto const& obj : last_iter->second) { - if (obj.first.isMigratable()) { - history_[phase].insert(obj.first); - } - } + // Phase is identical to previous one, use the shared pointer to data from previous phase + addIdenticalPhase(phase, last_found_phase); } else if(lbdh.skipped_phases_.find(phase) == lbdh.skipped_phases_.end()) { - // Phases which are not present must be specified in metadata of the file - vtAbort("Could not find data: Unspecified phases needs to be present in skipped section of the file metadata"); + vtAbort("Could not find data: Skipped phases needs to be listed in file metadata."); } } } @@ -143,12 +140,12 @@ void LBDataRestartReader::arriving(ArriveMsg* msg) { } void LBDataRestartReader::update(UpdateMsg* msg) { - auto iter = history_[msg->phase].find(msg->elm); - vtAssert(iter != history_[msg->phase].end(), "Must exist"); + auto iter = history_[msg->phase]->find(msg->elm); + vtAssert(iter != history_[msg->phase]->end(), "Must exist"); auto elm = *iter; elm.curr_node = msg->curr_node; - history_[msg->phase].erase(iter); - 
history_[msg->phase].insert(elm); + history_[msg->phase]->erase(iter); + history_[msg->phase]->insert(elm); } void LBDataRestartReader::checkBothEnds(Coord& coord) { @@ -167,26 +164,21 @@ void LBDataRestartReader::determinePhasesToMigrate() { runInEpochCollective("LBDataRestartReader::updateLocations", [&]{ PhaseType curr = 0, next; for (;curr < num_phases_ - 1;) { - // find number of next Phase - for(next = curr + 1; next < num_phases_; ++next) { - if(history_.find(next) != history_.end()) { - break; - } - } + next = findNextPhase(curr); - local_changed_distro[curr] = history_[curr] != history_[next]; + local_changed_distro[curr] = *history_[curr] != *history_[next]; if (local_changed_distro[curr]) { std::set departing, arriving; std::set_difference( - history_[next].begin(), history_[next].end(), - history_[curr].begin(), history_[curr].end(), + history_[next]->begin(), history_[next]->end(), + history_[curr]->begin(), history_[curr]->end(), std::inserter(arriving, arriving.begin()) ); std::set_difference( - history_[curr].begin(), history_[curr].end(), - history_[next].begin(), history_[next].end(), + history_[curr]->begin(), history_[curr]->end(), + history_[next]->begin(), history_[next]->end(), std::inserter(departing, departing.begin()) ); diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 2c2a84419a..7abb6fc529 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -131,14 +131,9 @@ struct LBDataRestartReader : runtime::component::Component * \return the next phase */ PhaseType findNextPhase(PhaseType phase) const { - auto next = phase + 1; - for(; next < num_phases_; ++next) { - if(history_.find(next) != history_.end()) { - return next; - } - } - vtAssert(history_.find(next) != history_.end(), "Must have a valid phase"); - return next; + auto iter = history_.upper_bound(phase); + vtAssert(iter != 
history_.end(), "Must have a valid phase"); + return iter->first; } /** @@ -146,11 +141,11 @@ struct LBDataRestartReader : runtime::component::Component * * \param[in] phase the phase * - * \return element assigned to this node + * \return pointer to elements assigned to this node, guaranted to be not null */ - std::set const& getDistro(PhaseType phase) { + std::shared_ptr> getDistro(PhaseType phase) const { auto iter = history_.find(phase); - vtAssert(iter != history_.end(), "Must have a valid phase"); + vtAssert(iter != history_.end() && iter->second != nullptr, "Must have a valid phase"); return iter->second; } @@ -166,6 +161,30 @@ struct LBDataRestartReader : runtime::component::Component } } + /** + * \brief Add history for a given phase + * + * \param[in] phase the phase to be added + * \param[in] distro the distribution to be added + */ + void addDistro(PhaseType phase, const std::set& distro) { + if (history_[phase] == nullptr) { + history_[phase] = std::make_shared>(); + } + history_[phase]->insert(distro.begin(), distro.end()); + } + + /** + * \brief Add identical phase to one already present + * + * \param[in] phase the phase to be added + * \param[in] identical the identical phase to be used + */ + void addIdenticalPhase(PhaseType phase, PhaseType identical) { + vtAssert(history_.find(identical) != history_.end(), "Identical phase was not added to history map."); + history_[phase] = history_[identical]; + } + private: /** * \brief Reduce distribution changes globally to find where migrations need @@ -188,7 +207,7 @@ struct LBDataRestartReader : runtime::component::Component std::vector changed_distro_; /// History of mapping that was read in from the data files - std::unordered_map> history_; + std::map>> history_; struct DepartMsg : vt::Message { DepartMsg(NodeType in_depart_node, PhaseType in_phase, ElementIDStruct in_elm) From 2c9831fb31c1a72151d4a1747510bdfab475e805 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 22 May 2023 
18:33:35 +0200 Subject: [PATCH 06/30] #2074: Add tests for detecting OfflineLB in LB data file --- src/vt/vrt/collection/balance/read_lb.cc | 15 ++++++--------- src/vt/vrt/collection/balance/read_lb.h | 3 ++- tests/unit/lb/test_lb_reader.nompi.cc | 18 ++++++++++++++++++ tests/unit/lb/test_offlinelb.cc | 3 +-- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/vt/vrt/collection/balance/read_lb.cc b/src/vt/vrt/collection/balance/read_lb.cc index a32e99e45b..d85edf680a 100644 --- a/src/vt/vrt/collection/balance/read_lb.cc +++ b/src/vt/vrt/collection/balance/read_lb.cc @@ -61,6 +61,7 @@ namespace vt { namespace vrt { namespace collection { namespace balance { /*static*/ typename ReadLBConfig::ConfigMapType ReadLBConfig::config_exact_ = {}; /*static*/ std::vector ReadLBConfig::config_prec_ = {}; /*static*/ bool ReadLBConfig::read_complete_ = false; +/*static*/ bool ReadLBConfig::has_offline_lb_ = false; /*static*/ bool ReadLBConfig::openConfig(std::string const& filename) { // No-op if no file specified. Can't be used to clear. 
@@ -102,15 +103,6 @@ namespace vt { namespace vrt { namespace collection { namespace balance { } } -/*static*/ bool ReadLBConfig::hasOfflineLB() { - for(auto&& ele : config_exact_) { - if(getLB(ele.first) == LBType::OfflineLB) { - return true; - } - } - return false; -} - /*static*/ ConfigEntry* ReadLBConfig::entry(ConfigIndex const& idx) { // First, search the exact iter config for this iteration: it has the highest // precedence @@ -240,6 +232,10 @@ int eatWhitespace(std::ifstream& file) { vtAbort(err_msg); } + if (lb_name == get_lb_names()[LBType::OfflineLB]) { + has_offline_lb_ = true; + } + map->emplace( std::piecewise_construct, std::forward_as_tuple(mod), @@ -252,6 +248,7 @@ int eatWhitespace(std::ifstream& file) { /*static*/ void ReadLBConfig::clear() { read_complete_ = false; + has_offline_lb_ = false; open_filename_ = ""; config_mod_.clear(); config_exact_.clear(); diff --git a/src/vt/vrt/collection/balance/read_lb.h b/src/vt/vrt/collection/balance/read_lb.h index 60f4c88dd0..03d409307f 100644 --- a/src/vt/vrt/collection/balance/read_lb.h +++ b/src/vt/vrt/collection/balance/read_lb.h @@ -197,7 +197,7 @@ struct ReadLBConfig { static ConfigIndex numEntries() { return config_mod_.size() + config_exact_.size(); } static ConfigEntry* entry(ConfigIndex const& idx); static LBType getLB(ConfigIndex const& idx); - static bool hasOfflineLB(); + static bool hasOfflineLB() { return has_offline_lb_; }; static ConfigMapType getModEntries() { return config_mod_; }; static ConfigMapType getExactEntries() {return config_exact_; }; static ParamMapType parseParams(std::vector params); @@ -209,6 +209,7 @@ struct ReadLBConfig { static void readFile(std::string const& filename); static bool read_complete_; + static bool has_offline_lb_; static std::string open_filename_; static ConfigMapType config_mod_; static ConfigMapType config_exact_; diff --git a/tests/unit/lb/test_lb_reader.nompi.cc b/tests/unit/lb/test_lb_reader.nompi.cc index 3451bab7a7..d21fc7277f 100644 --- 
a/tests/unit/lb/test_lb_reader.nompi.cc +++ b/tests/unit/lb/test_lb_reader.nompi.cc @@ -66,6 +66,7 @@ TEST_F(TestLBReader, test_lb_read_1) { Config::clear(); Config::openConfig(file_name); + EXPECT_EQ(Config::hasOfflineLB(), false); EXPECT_EQ(Config::numEntries(), 3); EXPECT_EQ(Config::getExactEntries().size(), 2); EXPECT_EQ(Config::getModEntries().size(), 1); @@ -120,6 +121,7 @@ TEST_F(TestLBReader, test_lb_read_2) { Config::clear(); Config::openConfig(file_name); + EXPECT_EQ(Config::hasOfflineLB(), false); EXPECT_EQ(Config::numEntries(), 5); for (ConfigIdx i = 0; i < 121; i++) { auto entry = Config::entry(i); @@ -195,4 +197,20 @@ TEST_F(TestLBReader, test_lb_read_2) { EXPECT_EQ(Config::toString(), expected_config); } +TEST_F(TestLBReader, test_lb_read_3_with_offline_lb) { + std::string file_name = "test_lb_read_3_with_offline_lb.txt"; + std::ofstream out(file_name); + out << "" + "0 NoLB\n" + "1 OfflineLB\n" + "%10 OfflineLB\n"; + out.close(); + + using Config = vt::vrt::collection::balance::ReadLBConfig; + + Config::clear(); + Config::openConfig(file_name); + EXPECT_EQ(Config::hasOfflineLB(), true); +} + }}} // end namespace vt::tests::unit diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index eaf1bf6750..161e01fa0f 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -76,7 +76,6 @@ struct SimCol : vt::Collection { } void sparseHandler(Msg* m){ - // TODO } }; @@ -198,7 +197,7 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { nlohmann::json metadata, phasesMetadata; phasesMetadata["count"] = num_phases; phasesMetadata["skipped"]["list"] = {2}; - phasesMetadata["skipped"]["range"] = {{2,3}}; + phasesMetadata["skipped"]["range"] = {{3,3}}; phasesMetadata["identical_to_previous"]["list"] = {1}; phasesMetadata["identical_to_previous"]["range"] = {{5,6}}; metadata["type"] = "LBDatafile"; From 3a3a5879752661f781e26ee2c1a8ef1162685af3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 23 May 2023 
13:31:40 +0200 Subject: [PATCH 07/30] #2074: Add checks in sparse handler --- tests/unit/lb/test_offlinelb.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 161e01fa0f..68d5d7dc01 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -76,6 +76,15 @@ struct SimCol : vt::Collection { } void sparseHandler(Msg* m){ + auto const this_node = theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + auto const next_node = (this_node + 1) % num_nodes; + vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); + if (m->iter == 0 or m->iter == 1) { + EXPECT_EQ(getIndex().x() / 2, this_node); + } else if (m->iter == 2 or m->iter == 3 or m->iter == 4 or m->iter == 5 or m->iter == 6) { + EXPECT_EQ(getIndex().x() / 2, next_node); + } } }; From be7af02983c7d7ce262a35d19dddf5beba75ed20 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 12 Jun 2023 16:52:57 +0200 Subject: [PATCH 08/30] #2074: Disallow runing of OfflineLB for skipped phases. 
--- .../balance/lb_data_restart_reader.cc | 4 +-- .../balance/lb_data_restart_reader.h | 26 +++++++++---------- .../collection/balance/offlinelb/offlinelb.cc | 2 +- tests/unit/lb/test_offlinelb.cc | 20 +++++++++++--- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index 236727e97f..de2572f458 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -177,8 +177,8 @@ void LBDataRestartReader::determinePhasesToMigrate() { ); std::set_difference( - history_[curr]->begin(), history_[curr]->end(), - history_[next]->begin(), history_[next]->end(), + history_[curr]->begin(), history_[curr]->end(), + history_[next]->begin(), history_[next]->end(), std::inserter(departing, departing.begin()) ); diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 7abb6fc529..01e878d160 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -123,19 +123,6 @@ struct LBDataRestartReader : runtime::component::Component | num_phases_; } - /** - * \brief Find the next phase - * - * \param phase the current phase - * - * \return the next phase - */ - PhaseType findNextPhase(PhaseType phase) const { - auto iter = history_.upper_bound(phase); - vtAssert(iter != history_.end(), "Must have a valid phase"); - return iter->first; - } - /** * \brief Get the elements assigned for a given phase * @@ -186,6 +173,19 @@ struct LBDataRestartReader : runtime::component::Component } private: + /** + * \brief Find the next specified phase or an identical one + * + * \param phase the current phase + * + * \return the next phase + */ + PhaseType findNextPhase(PhaseType phase) const { + auto iter = history_.upper_bound(phase); + vtAssert(iter != history_.end(), 
"Must have a valid phase"); + return iter->first; + } + /** * \brief Reduce distribution changes globally to find where migrations need * to occur diff --git a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc index d81313a30a..0ed7636aab 100644 --- a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc +++ b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc @@ -59,7 +59,7 @@ void OfflineLB::runLB(LoadType) { for (auto&& elm : distro) { migrateObjectTo(elm, theContext()->getNode()); } - theLBDataReader()->clearDistro(nextPhase); + theLBDataReader()->clearDistro(phase_ + 1); } }}}} /* end namespace vt::vrt::collection::lb */ diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 68d5d7dc01..8b0fb6af4d 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -80,9 +80,9 @@ struct SimCol : vt::Collection { auto const num_nodes = theContext()->getNumNodes(); auto const next_node = (this_node + 1) % num_nodes; vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); - if (m->iter == 0 or m->iter == 1) { + if (m->iter == 0 or m->iter == 1 or m->iter == 2 or m->iter == 3 or m->iter == 4) { EXPECT_EQ(getIndex().x() / 2, this_node); - } else if (m->iter == 2 or m->iter == 3 or m->iter == 4 or m->iter == 5 or m->iter == 6) { + } else if (m->iter == 5 or m->iter == 6) { EXPECT_EQ(getIndex().x() / 2, next_node); } } @@ -224,8 +224,22 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { } stream = w->finish(); + // Preapre configuration file + std::string file_name = "test_offlinelb_2.txt"; + std::ofstream out(file_name); + out << "" + "0 OfflineLB\n" + "1 NoLB\n" + "2 NoLB\n" + "3 NoLB\n" + "4 OfflineLB\n" + "5 OfflineLB\n" + "6 NoLB\n"; + out.close(); + theConfig()->vt_lb = true; - theConfig()->vt_lb_name = "OfflineLB"; + theConfig()->vt_lb_file_name = "test_offlinelb_2.txt"; + auto up = LBDataRestartReader::construct(); 
curRT->theLBDataReader = up.get(); theLBDataReader()->readLBDataFromStream(std::move(stream)); From 8162f50e9a8a875bc636d93eb60ab0df12acd0be Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 13 Jun 2023 13:53:09 +0200 Subject: [PATCH 09/30] #2074: Disallow operating on non-consecutive phases --- .../balance/lb_data_restart_reader.cc | 50 +++++++++---------- .../balance/lb_data_restart_reader.h | 15 +----- tests/unit/lb/test_offlinelb.cc | 6 +-- 3 files changed, 26 insertions(+), 45 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index de2572f458..3dc0e14294 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -162,34 +162,32 @@ void LBDataRestartReader::determinePhasesToMigrate() { auto const this_node = theContext()->getNode(); runInEpochCollective("LBDataRestartReader::updateLocations", [&]{ - PhaseType curr = 0, next; - for (;curr < num_phases_ - 1;) { - next = findNextPhase(curr); - - local_changed_distro[curr] = *history_[curr] != *history_[next]; - if (local_changed_distro[curr]) { - std::set departing, arriving; - - std::set_difference( - history_[next]->begin(), history_[next]->end(), - history_[curr]->begin(), history_[curr]->end(), - std::inserter(arriving, arriving.begin()) - ); - - std::set_difference( - history_[curr]->begin(), history_[curr]->end(), - history_[next]->begin(), history_[next]->end(), - std::inserter(departing, departing.begin()) - ); - - for (auto&& d : departing) { - proxy_[d.getHomeNode()].send(this_node, next, d); - } - for (auto&& a : arriving) { - proxy_[a.getHomeNode()].send(this_node, next, a); + for (PhaseType curr = 0; curr < num_phases_ - 1; ++curr) { + if(history_.count(curr) && history_.count(curr + 1)) { + local_changed_distro[curr] = *history_[curr] != *history_[curr + 1]; + if (local_changed_distro[curr]) { + std::set departing, 
arriving; + + std::set_difference( + history_[curr + 1]->begin(), history_[curr + 1]->end(), + history_[curr]->begin(), history_[curr]->end(), + std::inserter(arriving, arriving.begin()) + ); + + std::set_difference( + history_[curr]->begin(), history_[curr]->end(), + history_[curr + 1]->begin(), history_[curr + 1]->end(), + std::inserter(departing, departing.begin()) + ); + + for (auto&& d : departing) { + proxy_[d.getHomeNode()].send(this_node, curr + 1, d); + } + for (auto&& a : arriving) { + proxy_[a.getHomeNode()].send(this_node, curr + 1, a); + } } } - curr = next; } }); diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 01e878d160..6298d7079d 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -128,7 +128,7 @@ struct LBDataRestartReader : runtime::component::Component * * \param[in] phase the phase * - * \return pointer to elements assigned to this node, guaranted to be not null + * \return pointer to elements assigned to this node, guaranteed to be not null */ std::shared_ptr> getDistro(PhaseType phase) const { auto iter = history_.find(phase); @@ -173,19 +173,6 @@ struct LBDataRestartReader : runtime::component::Component } private: - /** - * \brief Find the next specified phase or an identical one - * - * \param phase the current phase - * - * \return the next phase - */ - PhaseType findNextPhase(PhaseType phase) const { - auto iter = history_.upper_bound(phase); - vtAssert(iter != history_.end(), "Must have a valid phase"); - return iter->first; - } - /** * \brief Reduce distribution changes globally to find where migrations need * to occur diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 8b0fb6af4d..7d9b8788a3 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -77,13 +77,9 @@ struct SimCol : vt::Collection { void 
sparseHandler(Msg* m){ auto const this_node = theContext()->getNode(); - auto const num_nodes = theContext()->getNumNodes(); - auto const next_node = (this_node + 1) % num_nodes; vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); - if (m->iter == 0 or m->iter == 1 or m->iter == 2 or m->iter == 3 or m->iter == 4) { + if (m->iter == 0 or m->iter == 1 or m->iter == 2 or m->iter == 3 or m->iter == 4 or m->iter == 5 or m->iter == 6) { EXPECT_EQ(getIndex().x() / 2, this_node); - } else if (m->iter == 5 or m->iter == 6) { - EXPECT_EQ(getIndex().x() / 2, next_node); } } }; From 9e0a9c1ea3f2109c1c6e255ef5931259ff07ca7e Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 19 Oct 2023 16:00:18 +0200 Subject: [PATCH 10/30] #2074: Update implementation after resolving conflicts --- src/vt/vrt/collection/balance/offlinelb/offlinelb.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc index 0ed7636aab..b9abd6d03f 100644 --- a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc +++ b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc @@ -55,8 +55,8 @@ void OfflineLB::init(objgroup::proxy::Proxy in_proxy) { } void OfflineLB::runLB(LoadType) { - auto const& distro = theLBDataReader()->getDistro(phase_ + 1); - for (auto&& elm : distro) { + auto const distro = theLBDataReader()->getDistro(phase_ + 1); + for (auto&& elm : *distro) { migrateObjectTo(elm, theContext()->getNode()); } theLBDataReader()->clearDistro(phase_ + 1); From 21c8df879a6290891087c0446648398c81e310ce Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Wed, 29 Nov 2023 14:55:31 +0100 Subject: [PATCH 11/30] #2074: Use getUniqueFilename for file name in offlinelb tests --- tests/unit/lb/test_lb_reader.nompi.cc | 7 ++++--- tests/unit/lb/test_offlinelb.cc | 21 +++++++++------------ 2 files changed, 13 insertions(+), 15 deletions(-) diff 
--git a/tests/unit/lb/test_lb_reader.nompi.cc b/tests/unit/lb/test_lb_reader.nompi.cc index d21fc7277f..5da78769d9 100644 --- a/tests/unit/lb/test_lb_reader.nompi.cc +++ b/tests/unit/lb/test_lb_reader.nompi.cc @@ -44,6 +44,7 @@ #include #include "test_harness.h" +#include "test_helpers.h" namespace vt { namespace tests { namespace unit { @@ -51,7 +52,7 @@ using TestLBReader = TestHarness; TEST_F(TestLBReader, test_lb_read_1) { - std::string file_name = "test_lb_read_1.txt"; + std::string file_name = getUniqueFilename(".txt"); std::ofstream out(file_name); out << "" "0 NoLB\n" @@ -105,7 +106,7 @@ TEST_F(TestLBReader, test_lb_read_1) { TEST_F(TestLBReader, test_lb_read_2) { - std::string file_name = "test_lb_read_2.txt"; + std::string file_name = getUniqueFilename(".txt"); std::ofstream out(file_name); out << "" "0 NoLB\n" @@ -198,7 +199,7 @@ TEST_F(TestLBReader, test_lb_read_2) { } TEST_F(TestLBReader, test_lb_read_3_with_offline_lb) { - std::string file_name = "test_lb_read_3_with_offline_lb.txt"; + std::string file_name = getUniqueFilename(".txt"); std::ofstream out(file_name); out << "" "0 NoLB\n" diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 7d9b8788a3..bcef890abd 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -48,6 +48,7 @@ #include #include "test_parallel_harness.h" +#include "test_helpers.h" namespace vt { namespace tests { namespace unit { namespace lb { @@ -78,7 +79,7 @@ struct SimCol : vt::Collection { void sparseHandler(Msg* m){ auto const this_node = theContext()->getNode(); vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); - if (m->iter == 0 or m->iter == 1 or m->iter == 2 or m->iter == 3 or m->iter == 4 or m->iter == 5 or m->iter == 6) { + if (m->iter >= 0 and m->iter <= 6) { EXPECT_EQ(getIndex().x() / 2, this_node); } } @@ -208,20 +209,16 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { metadata["type"] = "LBDatafile"; metadata["phases"] = 
phasesMetadata; - auto w = std::make_unique( + auto appender = std::make_unique( "phases", metadata, std::move(stream), true ); - for (PhaseType i = 0; i < num_phases; i++) { - // ignore skipped and identical phases - if(i != 1 && i != 2 && i != 3 && i != 5 && i != 6) { - auto j = dh.toJson(i); - w->addElm(*j); - } - } - stream = w->finish(); + // Add phases 0 and 4 + appender->addElm(*dh.toJson(0)); + appender->addElm(*dh.toJson(4)); + stream = appender->finish(); // Preapre configuration file - std::string file_name = "test_offlinelb_2.txt"; + std::string file_name = getUniqueFilenameWithRanks(".txt"); std::ofstream out(file_name); out << "" "0 OfflineLB\n" @@ -234,7 +231,7 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { out.close(); theConfig()->vt_lb = true; - theConfig()->vt_lb_file_name = "test_offlinelb_2.txt"; + theConfig()->vt_lb_file_name = file_name; auto up = LBDataRestartReader::construct(); curRT->theLBDataReader = up.get(); From c7ad33761c098a6983304f19551603d52e1c56c1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 30 Nov 2023 19:05:14 +0100 Subject: [PATCH 12/30] #2074: Update data selection for offlineLB sparse test --- .../balance/lb_data_restart_reader.h | 13 ++--- .../collection/balance/offlinelb/offlinelb.cc | 8 ++- tests/unit/lb/test_offlinelb.cc | 57 ++++++++++++------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 6298d7079d..830ff4aa5e 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -128,12 +128,14 @@ struct LBDataRestartReader : runtime::component::Component * * \param[in] phase the phase * - * \return pointer to elements assigned to this node, guaranteed to be not null + * \return pointer to elements assigned to this node if not skipped */ std::shared_ptr> getDistro(PhaseType phase) const { auto iter = 
history_.find(phase); - vtAssert(iter != history_.end() && iter->second != nullptr, "Must have a valid phase"); - return iter->second; + if (iter != history_.end()) { + return iter->second; + } + return nullptr; } /** @@ -142,10 +144,7 @@ struct LBDataRestartReader : runtime::component::Component * \param[in] phase the phase to clear */ void clearDistro(PhaseType phase) { - auto iter = history_.find(phase); - if (iter != history_.end()) { - history_.erase(iter); - } + history_.erase(phase); } /** diff --git a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc index b9abd6d03f..a7ddb3aad8 100644 --- a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc +++ b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc @@ -56,10 +56,12 @@ void OfflineLB::init(objgroup::proxy::Proxy in_proxy) { void OfflineLB::runLB(LoadType) { auto const distro = theLBDataReader()->getDistro(phase_ + 1); - for (auto&& elm : *distro) { - migrateObjectTo(elm, theContext()->getNode()); + if (distro) { + for (auto&& elm : *distro) { + migrateObjectTo(elm, theContext()->getNode()); + } + theLBDataReader()->clearDistro(phase_ + 1); } - theLBDataReader()->clearDistro(phase_ + 1); } }}}} /* end namespace vt::vrt::collection::lb */ diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index bcef890abd..db83c49c68 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -78,8 +78,15 @@ struct SimCol : vt::Collection { void sparseHandler(Msg* m){ auto const this_node = theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + auto const next_node = (this_node + 1) % num_nodes; + auto const prev_node = this_node - 1 >= 0 ? 
this_node - 1 : num_nodes - 1; vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); - if (m->iter >= 0 and m->iter <= 6) { + if (m->iter == 7 or m->iter == 8 or m->iter == 9) { + EXPECT_EQ(getIndex().x() / 2, next_node); + } else if (m->iter == 4 or m-> iter == 5) { + EXPECT_EQ(getIndex().x() / 2, prev_node); + } else { EXPECT_EQ(getIndex().x() / 2, this_node); } } @@ -171,8 +178,8 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { std::unordered_map> ids; int len = 2; - PhaseType num_phases = 7; - for (int i = 0; i < len; i++) { + PhaseType num_phases = 10; + for (int i = 0; i < len * 2; i++) { auto id = elm::ElmIDBits::createCollectionImpl(true, i+1, this_node, this_node); id.curr_node = this_node; ids[0].push_back(id); @@ -183,18 +190,19 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { } for (int i = 0; i < len; i++) { - auto pid = elm::ElmIDBits::createCollectionImpl(true, i+1, prev_node, this_node); auto nid = elm::ElmIDBits::createCollectionImpl(true, i+1, next_node, this_node); - ids[1].push_back(pid); - ids[2].push_back(pid); - ids[4].push_back(nid); - ids[5].push_back(nid); + auto pid = elm::ElmIDBits::createCollectionImpl(true, i+1, prev_node, this_node); + ids[4].push_back(pid); + ids[7].push_back(nid); } LBDataHolder dh; for (PhaseType i = 0; i < num_phases; i++) { - for (auto&& elm : ids[i]) { - dh.node_data_[i][elm] = LoadSummary{3}; + if (i != 1 and i != 2 and i != 5 and i != 8 and i != 9) { + auto& elms = ids[i]; + for(std::size_t j = 0; j < elms.size(); j++) { + dh.node_data_[i][elms[j]] = LoadSummary{ static_cast(i + j) + 3}; + } } } @@ -202,19 +210,23 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { std::stringstream stream{std::ios_base::out | std::ios_base::in}; nlohmann::json metadata, phasesMetadata; phasesMetadata["count"] = num_phases; - phasesMetadata["skipped"]["list"] = {2}; - phasesMetadata["skipped"]["range"] = {{3,3}}; - phasesMetadata["identical_to_previous"]["list"] = {1}; - 
phasesMetadata["identical_to_previous"]["range"] = {{5,6}}; + phasesMetadata["skipped"]["list"] = {9}; + phasesMetadata["skipped"]["range"] = {{1,2}}; + phasesMetadata["identical_to_previous"]["list"] = {8}; + phasesMetadata["identical_to_previous"]["range"] = {{5,5}}; metadata["type"] = "LBDatafile"; metadata["phases"] = phasesMetadata; auto appender = std::make_unique( "phases", metadata, std::move(stream), true ); - // Add phases 0 and 4 - appender->addElm(*dh.toJson(0)); - appender->addElm(*dh.toJson(4)); + for (PhaseType i = 0; i < num_phases; i++) { + // ignore skipped and identical phases + if(i != 1 and i != 2 and i != 5 and i != 8 and i != 9) { + auto j = dh.toJson(i); + appender->addElm(*j); + } + } stream = appender->finish(); // Preapre configuration file @@ -222,12 +234,15 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { std::ofstream out(file_name); out << "" "0 OfflineLB\n" - "1 NoLB\n" - "2 NoLB\n" - "3 NoLB\n" + "1 OfflineLB\n" + "2 OfflineLB\n" + "3 OfflineLB\n" "4 OfflineLB\n" "5 OfflineLB\n" - "6 NoLB\n"; + "6 OfflineLB\n" + "7 OfflineLB\n" + "8 OfflineLB\n" + "9 OfflineLB\n"; out.close(); theConfig()->vt_lb = true; From 2661a04e5476d14cf55f4a1da8a99af3984e6b1d Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 29 Feb 2024 15:15:50 +0100 Subject: [PATCH 13/30] #2074: Calculate number of phases after removal of the count_ property --- .../balance/lb_data_restart_reader.cc | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index 3dc0e14294..7f348805f6 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -79,8 +79,26 @@ void LBDataRestartReader::startup() { } void LBDataRestartReader::readHistory(LBDataHolder const& lbdh) { + auto find_max_data_phase = [&]() -> PhaseType { + if (lbdh.node_data_.empty()) { + 
return 0; + } + return std::max_element( + lbdh.node_data_.begin(), lbdh.node_data_.end(), + [](const auto& p1, const auto& p2) { return p1.first < p2.first; }) + ->first; + }; + + // Find last phase number + auto largest_data = find_max_data_phase(); + auto largest_identical = + lbdh.identical_phases_.size() > 0 ? *lbdh.identical_phases_.rbegin() : 0; + auto largest_skipped = + lbdh.skipped_phases_.size() > 0 ? *lbdh.skipped_phases_.rbegin() : 0; + num_phases_ = + std::max(std::max(largest_data, largest_identical), largest_skipped) + 1; + PhaseType last_found_phase = 0; - num_phases_ = lbdh.count_; for (PhaseType phase = 0; phase < num_phases_; phase++) { auto iter = lbdh.node_data_.find(phase); if (iter != lbdh.node_data_.end()) { @@ -93,11 +111,13 @@ void LBDataRestartReader::readHistory(LBDataHolder const& lbdh) { history_[phase]->insert(obj.first); } } - } else if(lbdh.identical_phases_.find(phase) != lbdh.identical_phases_.end()) { + } else if ( + lbdh.identical_phases_.find(phase) != lbdh.identical_phases_.end()) { // Phase is identical to previous one, use the shared pointer to data from previous phase addIdenticalPhase(phase, last_found_phase); - } else if(lbdh.skipped_phases_.find(phase) == lbdh.skipped_phases_.end()) { - vtAbort("Could not find data: Skipped phases needs to be listed in file metadata."); + } else if (lbdh.skipped_phases_.find(phase) == lbdh.skipped_phases_.end()) { + vtAbort("Could not find data: Skipped phases needs to be listed in file " + "metadata."); } } } From 6bda56eda6a8e80a3aede7b6210370d42383dd95 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 25 Mar 2024 15:15:52 +0100 Subject: [PATCH 14/30] #2074: Fix logic for exsiting offlineLB test --- .../vrt/collection/balance/lb_data_restart_reader.h | 8 +++----- .../vrt/collection/balance/offlinelb/offlinelb.cc | 8 +++----- tests/unit/lb/test_offlinelb.cc | 13 ++++++++----- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git 
a/src/vt/vrt/collection/balance/lb_data_restart_reader.h b/src/vt/vrt/collection/balance/lb_data_restart_reader.h index 830ff4aa5e..6373a81153 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.h +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.h @@ -132,10 +132,8 @@ struct LBDataRestartReader : runtime::component::Component */ std::shared_ptr> getDistro(PhaseType phase) const { auto iter = history_.find(phase); - if (iter != history_.end()) { - return iter->second; - } - return nullptr; + vtAssert(iter != history_.end(), "Must have a valid, not skipped phase"); + return iter->second; } /** @@ -193,7 +191,7 @@ struct LBDataRestartReader : runtime::component::Component std::vector changed_distro_; /// History of mapping that was read in from the data files - std::map>> history_; + std::unordered_map>> history_; struct DepartMsg : vt::Message { DepartMsg(NodeType in_depart_node, PhaseType in_phase, ElementIDStruct in_elm) diff --git a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc index a7ddb3aad8..b9abd6d03f 100644 --- a/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc +++ b/src/vt/vrt/collection/balance/offlinelb/offlinelb.cc @@ -56,12 +56,10 @@ void OfflineLB::init(objgroup::proxy::Proxy in_proxy) { void OfflineLB::runLB(LoadType) { auto const distro = theLBDataReader()->getDistro(phase_ + 1); - if (distro) { - for (auto&& elm : *distro) { - migrateObjectTo(elm, theContext()->getNode()); - } - theLBDataReader()->clearDistro(phase_ + 1); + for (auto&& elm : *distro) { + migrateObjectTo(elm, theContext()->getNode()); } + theLBDataReader()->clearDistro(phase_ + 1); } }}}} /* end namespace vt::vrt::collection::lb */ diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index db83c49c68..161ee50a9c 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -232,17 +232,20 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { // Preapre 
configuration file std::string file_name = getUniqueFilenameWithRanks(".txt"); std::ofstream out(file_name); + + // NoLB for phases on skipped list and one phase before them. + // Phases on the skipped list: 1, 2, 9 out << "" - "0 OfflineLB\n" - "1 OfflineLB\n" - "2 OfflineLB\n" + "0 NoLB\n" + "1 NoLB\n" + "2 NoLB\n" "3 OfflineLB\n" "4 OfflineLB\n" "5 OfflineLB\n" "6 OfflineLB\n" "7 OfflineLB\n" - "8 OfflineLB\n" - "9 OfflineLB\n"; + "8 NoLB\n" + "9 NoLB\n"; out.close(); theConfig()->vt_lb = true; From ec87267322831f299b286a3ce5435347978c3f8b Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 26 Mar 2024 13:19:18 +0100 Subject: [PATCH 15/30] #2074: Add case to the offlinelb test which will result in assert --- .../balance/lb_data_restart_reader.cc | 20 +++++++++---------- tests/unit/lb/test_offlinelb.cc | 14 +++++++++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc index 7f348805f6..2fe43be8f3 100644 --- a/src/vt/vrt/collection/balance/lb_data_restart_reader.cc +++ b/src/vt/vrt/collection/balance/lb_data_restart_reader.cc @@ -182,29 +182,29 @@ void LBDataRestartReader::determinePhasesToMigrate() { auto const this_node = theContext()->getNode(); runInEpochCollective("LBDataRestartReader::updateLocations", [&]{ - for (PhaseType curr = 0; curr < num_phases_ - 1; ++curr) { - if(history_.count(curr) && history_.count(curr + 1)) { - local_changed_distro[curr] = *history_[curr] != *history_[curr + 1]; - if (local_changed_distro[curr]) { + for (PhaseType i = 0; i < num_phases_ - 1; ++i) { + if(history_.count(i) && history_.count(i+1)) { + local_changed_distro[i] = *history_[i] != *history_[i+1]; + if (local_changed_distro[i]) { std::set departing, arriving; std::set_difference( - history_[curr + 1]->begin(), history_[curr + 1]->end(), - history_[curr]->begin(), history_[curr]->end(), + history_[i+1]->begin(), 
history_[i+1]->end(), + history_[i]->begin(), history_[i]->end(), std::inserter(arriving, arriving.begin()) ); std::set_difference( - history_[curr]->begin(), history_[curr]->end(), - history_[curr + 1]->begin(), history_[curr + 1]->end(), + history_[i]->begin(), history_[i]->end(), + history_[i+1]->begin(), history_[i+1]->end(), std::inserter(departing, departing.begin()) ); for (auto&& d : departing) { - proxy_[d.getHomeNode()].send(this_node, curr + 1, d); + proxy_[d.getHomeNode()].send(this_node, i+1, d); } for (auto&& a : arriving) { - proxy_[a.getHomeNode()].send(this_node, curr + 1, a); + proxy_[a.getHomeNode()].send(this_node, i+1, a); } } } diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 161ee50a9c..03f8f4cbf1 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -245,7 +245,7 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { "6 OfflineLB\n" "7 OfflineLB\n" "8 NoLB\n" - "9 NoLB\n"; + "9 OfflineLB\n"; // Set to OfflineLB to provoke crash out.close(); theConfig()->vt_lb = true; @@ -261,12 +261,22 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { .bulkInsert() .wait(); - for (PhaseType i = 0; i < num_phases; i++) { + // Do work for properly configured phases 0-8 + for (PhaseType i = 0; i <= 8; i++) { runInEpochCollective("run sparseHandler", [&]{ proxy.broadcastCollective(i); }); thePhase()->nextPhaseCollective(); } + + if(num_nodes == 1) { + // Try to run OfflineLB on phase 9 which will trigger assert. 
+ PhaseType crashingPhase = 9; + runInEpochCollective("run sparseHandler", [&]{ + proxy.broadcastCollective(crashingPhase); + }); + EXPECT_DEATH(thePhase()->nextPhaseCollective(), ""); + } } #endif From 50a77732c22498c7541be97444d4af7b5c15d84a Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 26 Mar 2024 13:35:26 +0100 Subject: [PATCH 16/30] #2074: Update sparseHandler in offlineLB tests --- tests/unit/lb/test_offlinelb.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 03f8f4cbf1..be4604081b 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -82,12 +82,12 @@ struct SimCol : vt::Collection { auto const next_node = (this_node + 1) % num_nodes; auto const prev_node = this_node - 1 >= 0 ? this_node - 1 : num_nodes - 1; vt_debug_print(terse, lb, "sparseHandler: idx={}: elm={}\n", getIndex(), getElmID()); - if (m->iter == 7 or m->iter == 8 or m->iter == 9) { - EXPECT_EQ(getIndex().x() / 2, next_node); + if (m->iter <= 3 or m->iter == 6) { + EXPECT_EQ(getIndex().x() / 2, this_node); } else if (m->iter == 4 or m-> iter == 5) { EXPECT_EQ(getIndex().x() / 2, prev_node); - } else { - EXPECT_EQ(getIndex().x() / 2, this_node); + } else if (m->iter == 7 or m->iter == 8) { + EXPECT_EQ(getIndex().x() / 2, next_node); } } }; From 10d656c188fe73c4eadbfa498717afc729b656be Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 26 Mar 2024 14:26:12 +0100 Subject: [PATCH 17/30] #2074: Remove failing part of the test --- tests/unit/lb/test_offlinelb.cc | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index be4604081b..731244c811 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -86,7 +86,7 @@ struct SimCol : vt::Collection { EXPECT_EQ(getIndex().x() / 2, this_node); } else if (m->iter == 4 or m-> 
iter == 5) { EXPECT_EQ(getIndex().x() / 2, prev_node); - } else if (m->iter == 7 or m->iter == 8) { + } else if (m->iter == 7 or m->iter == 8 or m->iter == 9) { EXPECT_EQ(getIndex().x() / 2, next_node); } } @@ -245,7 +245,7 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { "6 OfflineLB\n" "7 OfflineLB\n" "8 NoLB\n" - "9 OfflineLB\n"; // Set to OfflineLB to provoke crash + "9 NoLB\n"; out.close(); theConfig()->vt_lb = true; @@ -262,21 +262,12 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { .wait(); // Do work for properly configured phases 0-8 - for (PhaseType i = 0; i <= 8; i++) { + for (PhaseType i = 0; i < num_phases; i++) { runInEpochCollective("run sparseHandler", [&]{ proxy.broadcastCollective(i); }); thePhase()->nextPhaseCollective(); } - - if(num_nodes == 1) { - // Try to run OfflineLB on phase 9 which will trigger assert. - PhaseType crashingPhase = 9; - runInEpochCollective("run sparseHandler", [&]{ - proxy.broadcastCollective(crashingPhase); - }); - EXPECT_DEATH(thePhase()->nextPhaseCollective(), ""); - } } #endif From 720e92075aefb6dd4b12a624526da176b0cd0ef8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Wed, 27 Mar 2024 16:03:51 +0100 Subject: [PATCH 18/30] #2074: Check the phase if needs OfflineLB even if requested in config --- src/vt/runtime/runtime.cc | 6 +++--- .../collection/balance/lb_invoke/lb_manager.cc | 5 +++++ tests/unit/lb/test_offlinelb.cc | 15 +++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index d62b1f9a07..74e5e124eb 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -435,10 +435,10 @@ bool Runtime::needLBDataRestartReader() { return ReadLBConfig::hasOfflineLB(); } } - return false; - } else + } #endif - return false; + + return false; } bool Runtime::initialize(bool const force_now) { diff --git a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc index 
2de4a94fae..873632ae90 100644 --- a/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc +++ b/src/vt/vrt/collection/balance/lb_invoke/lb_manager.cc @@ -150,6 +150,11 @@ LBType LBManager::decideLBToRun(PhaseType phase, bool try_file) { } } + // Check if LBDataRestartReader requires to run OfflineLB for a given phase. + if(the_lb == LBType::OfflineLB && !theLBDataReader()->needsLB(phase)) { + the_lb = LBType::NoLB; + } + vt_debug_print( terse, lb, "LBManager::decidedLBToRun: phase={}, return lb_={}\n", diff --git a/tests/unit/lb/test_offlinelb.cc b/tests/unit/lb/test_offlinelb.cc index 731244c811..541b9225e7 100644 --- a/tests/unit/lb/test_offlinelb.cc +++ b/tests/unit/lb/test_offlinelb.cc @@ -233,19 +233,19 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { std::string file_name = getUniqueFilenameWithRanks(".txt"); std::ofstream out(file_name); - // NoLB for phases on skipped list and one phase before them. - // Phases on the skipped list: 1, 2, 9 + // Request OfflineLB for each phase. + // LBDataRestartReader will check beforehand if that phase requires OfflineLB. 
out << "" - "0 NoLB\n" - "1 NoLB\n" - "2 NoLB\n" + "0 OfflineLB\n" + "1 OfflineLB\n" + "2 OfflineLB\n" "3 OfflineLB\n" "4 OfflineLB\n" "5 OfflineLB\n" "6 OfflineLB\n" "7 OfflineLB\n" - "8 NoLB\n" - "9 NoLB\n"; + "8 OfflineLB\n" + "9 OfflineLB\n"; out.close(); theConfig()->vt_lb = true; @@ -261,7 +261,6 @@ TEST_F(TestOfflineLB, test_offlinelb_2) { .bulkInsert() .wait(); - // Do work for properly configured phases 0-8 for (PhaseType i = 0; i < num_phases; i++) { runInEpochCollective("run sparseHandler", [&]{ proxy.broadcastCollective(i); From 546c63a3236f4b1b5fbabc934b4ff8d64197ad2f Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 4 Apr 2024 14:24:21 +0200 Subject: [PATCH 19/30] #2074: Fix crash when using param: --vt_lb_name=OfflineLB --- src/vt/runtime/runtime.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 74e5e124eb..e42d27146f 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -435,6 +435,7 @@ bool Runtime::needLBDataRestartReader() { return ReadLBConfig::hasOfflineLB(); } } + return true; } #endif From 186d7309fc6d93b443a77bbd5ff21a07a63052e3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Fri, 19 Apr 2024 15:26:06 +0200 Subject: [PATCH 20/30] #2074: Add unit tests to cover different initialization combinations for OfflineLB --- tests/unit/runtime/test_initialization.cc | 195 ++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index cdecb2fbfb..23251ad3a5 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -47,8 +47,12 @@ #include "test_helpers.h" #include +#include +#include +#include #include +#include namespace vt { namespace tests { namespace unit { @@ -320,4 +324,195 @@ TEST_F(TestInitialization, test_preconfigure_and_initialization) { vt::initializePreconfigured(&comm, &appConfig, 
vtConfig.get()); } +void preapreLBDataFiles(const std::string file_name_without_ext) { + using LBDataHolder = vt::vrt::collection::balance::LBDataHolder; + using ElementIDStruct = vt::vrt::collection::balance::ElementIDStruct; + using LoadSummary = vt::vrt::collection::balance::LoadSummary; + + auto const this_node = theContext()->getNode(); + auto const num_nodes = theContext()->getNumNodes(); + auto const next_node = (this_node + 1) % num_nodes; + auto const prev_node = this_node - 1 >= 0 ? this_node - 1 : num_nodes - 1; + + std::unordered_map> ids; + int len = 2; + PhaseType num_phases = 7; + for (int i = 0; i < len; i++) { + auto id = vt::elm::ElmIDBits::createCollectionImpl(true, i+1, this_node, this_node); + id.curr_node = this_node; + ids[0].push_back(id); + id.curr_node = next_node; + ids[3].push_back(id); + id.curr_node = prev_node; + ids[6].push_back(id); + } + + for (int i = 0; i < len; i++) { + auto pid = vt::elm::ElmIDBits::createCollectionImpl(true, i+1, prev_node, this_node); + auto nid = vt::elm::ElmIDBits::createCollectionImpl(true, i+1, next_node, this_node); + ids[1].push_back(pid); + ids[2].push_back(pid); + ids[4].push_back(nid); + ids[5].push_back(nid); + } + + LBDataHolder dh; + for (PhaseType i = 0; i < num_phases; i++) { + for (auto&& elm : ids[i]) { + dh.node_data_[i][elm] = LoadSummary{3}; + } + } + + using JSONAppender = util::json::Appender; + std::stringstream stream{std::ios_base::out | std::ios_base::in}; + nlohmann::json metadata; + metadata["type"] = "LBDatafile"; + auto w = std::make_unique( + "phases", metadata, std::move(stream), true + ); + for (PhaseType i = 0; i < num_phases; i++) { + auto j = dh.toJson(i); + w->addElm(*j); + } + stream = w->finish(); + + // save to files + for (int i = 0; i < 9; i++) { + std::string file_name = file_name_without_ext + "." 
+ std::to_string(i) + ".json"; + std::filesystem::path file_path = std::filesystem::current_path() / file_name; + std::ofstream out(file_path); + out << stream.str(); + out.close(); + } +} + +TEST_F(TestInitialization, test_initialize_without_restart_reader) { + MPI_Comm comm = MPI_COMM_WORLD; + + static char prog_name[]{"vt_program"}; + + std::vector custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_TRUE(theLBDataReader() == nullptr); +} + +TEST_F(TestInitialization, test_initialize_with_lb_data_in) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); + preapreLBDataFiles(prefix); + + static char prog_name[]{"vt_program"}; + static char data_in[]{"--vt_lb_data_in"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + + std::vector custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(data_in); + custom_args.emplace_back(const_cast(data_file_dir.c_str())); + custom_args.emplace_back(const_cast(data_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_data_in, true); + EXPECT_TRUE(theLBDataReader() != nullptr); +} + +TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); + 
preapreLBDataFiles(prefix); + + // Preapre configuration file + std::string file_name = getUniqueFilenameWithRanks(".txt"); + std::ofstream out(file_name); + out << "0 OfflineLB\n"; + out.close(); + + static char prog_name[]{"vt_program"}; + static char data_in[]{"--vt_lb_data_in"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + std::string config_file = "--vt_lb_file_name=" + file_name; + + std::vector custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(data_in); + custom_args.emplace_back(const_cast(data_file_dir.c_str())); + custom_args.emplace_back(const_cast(data_file.c_str())); + custom_args.emplace_back(const_cast(config_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_data_in, true); + EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); + EXPECT_TRUE(theLBDataReader() != nullptr); +} + +TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); + preapreLBDataFiles(prefix); + + // Preapre configuration file + std::string file_name = getUniqueFilenameWithRanks(".txt"); + std::ofstream out(file_name); + out << "0 NoLB\n"; + out.close(); + + static char prog_name[]{"vt_program"}; + static char data_in[]{"--vt_lb_data_in"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + std::string config_file = "--vt_lb_file_name=" + file_name; + + std::vector custom_args; 
+ custom_args.emplace_back(prog_name); + custom_args.emplace_back(data_in); + custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); + custom_args.emplace_back(const_cast<char*>(data_file.c_str())); + custom_args.emplace_back(const_cast<char*>(config_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_data_in, true); + EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); + EXPECT_TRUE(theLBDataReader() == nullptr); +} + }}} // end namespace vt::tests::unit From 44b9a6039486e4e157e10bc557f4fce600a63114 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 22 Apr 2024 17:38:34 +0200 Subject: [PATCH 21/30] #2074: Enable new tests only when Load Balancing is enabled --- tests/unit/runtime/test_initialization.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index 23251ad3a5..5ef119347e 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -404,6 +404,7 @@ TEST_F(TestInitialization, test_initialize_without_restart_reader) { EXPECT_TRUE(theLBDataReader() == nullptr); } +#if vt_feature_cmake_lblite TEST_F(TestInitialization, test_initialize_with_lb_data_in) { MPI_Comm comm = MPI_COMM_WORLD; @@ -474,6 +475,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); EXPECT_TRUE(theLBDataReader() != nullptr); } +#endif TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { MPI_Comm comm = MPI_COMM_WORLD; From b44c3014b627abb51cab7397e0939b1e6de77849 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 23 Apr 2024 16:30:22 +0200 Subject: [PATCH 22/30] #2074: Add flag for gcc-8 to compile 
std::filesystem --- cmake/link_vt.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/link_vt.cmake b/cmake/link_vt.cmake index 8b6303404b..1114cf7fbd 100644 --- a/cmake/link_vt.cmake +++ b/cmake/link_vt.cmake @@ -236,4 +236,11 @@ function(link_target_with_vt) if (vt_ubsan_enabled) target_link_libraries(${ARG_TARGET} PUBLIC ${ARG_BUILD_TYPE} -fsanitize=undefined) endif() + + # Enable additional flag for GCC-8 to link std::filesystem + if (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU") + if (NOT (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9)) + target_link_libraries(${ARG_TARGET} PUBLIC ${ARG_BUILD_TYPE} -lstdc++fs) + endif () + endif () endfunction() From f4da65d02708851f4da31174741b92e05d4a727b Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 23 May 2024 17:36:49 +0200 Subject: [PATCH 23/30] #2074: Fix creation of the test files --- tests/unit/runtime/test_initialization.cc | 28 +++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index 5ef119347e..e2509f101f 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -324,7 +324,7 @@ TEST_F(TestInitialization, test_preconfigure_and_initialization) { vt::initializePreconfigured(&comm, &appConfig, vtConfig.get()); } -void preapreLBDataFiles(const std::string file_name_without_ext) { +void prepareLBDataFiles(const std::string file_name_without_ext) { using LBDataHolder = vt::vrt::collection::balance::LBDataHolder; using ElementIDStruct = vt::vrt::collection::balance::ElementIDStruct; using LoadSummary = vt::vrt::collection::balance::LoadSummary; @@ -376,14 +376,12 @@ void preapreLBDataFiles(const std::string file_name_without_ext) { } stream = w->finish(); - // save to files - for (int i = 0; i < 9; i++) { - std::string file_name = file_name_without_ext + "." 
+ std::to_string(i) + ".json"; - std::filesystem::path file_path = std::filesystem::current_path() / file_name; - std::ofstream out(file_path); - out << stream.str(); - out.close(); - } + // save to file + std::string file_name = file_name_without_ext + "." + std::to_string(this_node) + ".json"; + std::filesystem::path file_path = std::filesystem::current_path() / file_name; + std::ofstream out(file_path); + out << stream.str(); + out.close(); } TEST_F(TestInitialization, test_initialize_without_restart_reader) { @@ -409,8 +407,8 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_in) { MPI_Comm comm = MPI_COMM_WORLD; // Preapre data files - auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); - preapreLBDataFiles(prefix); + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); static char prog_name[]{"vt_program"}; static char data_in[]{"--vt_lb_data_in"}; @@ -440,8 +438,8 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { MPI_Comm comm = MPI_COMM_WORLD; // Preapre data files - auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); - preapreLBDataFiles(prefix); + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); // Preapre configuration file std::string file_name = getUniqueFilenameWithRanks(".txt"); @@ -481,8 +479,8 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { MPI_Comm comm = MPI_COMM_WORLD; // Preapre data files - auto prefix = getUniqueFilenameWithRanks() + std::to_string(theContext()->getNode()); - preapreLBDataFiles(prefix); + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); // Preapre configuration file std::string file_name = getUniqueFilenameWithRanks(".txt"); From 894e7c364c9839f4ea8f1e4c057f22859f7bca1d Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Fri, 24 May 2024 12:57:26 +0200 Subject: [PATCH 24/30] #2074: Update unit tests to cover 
more combinations of OfflineLB parameters --- tests/unit/runtime/test_initialization.cc | 79 +++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index e2509f101f..6d0d1ec35d 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -399,6 +399,9 @@ TEST_F(TestInitialization, test_initialize_without_restart_reader) { vt::initialize(custom_argc, custom_argv, &comm); EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); + EXPECT_EQ(theConfig()->vt_lb_data_in, false); + EXPECT_EQ(theConfig()->vt_lb_file_name, ""); EXPECT_TRUE(theLBDataReader() == nullptr); } @@ -430,7 +433,9 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_in) { vt::initialize(custom_argc, custom_argv, &comm); EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); EXPECT_EQ(theConfig()->vt_lb_data_in, true); + EXPECT_EQ(theConfig()->vt_lb_file_name, ""); EXPECT_TRUE(theLBDataReader() != nullptr); } @@ -469,10 +474,83 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { vt::initialize(custom_argc, custom_argv, &comm); EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); EXPECT_EQ(theConfig()->vt_lb_data_in, true); EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); EXPECT_TRUE(theLBDataReader() != nullptr); } + +TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); + + // Preapre configuration file + std::string file_name = getUniqueFilenameWithRanks(".txt"); + std::ofstream out(file_name); + out << "0 OfflineLB\n"; + out.close(); + + static char prog_name[]{"vt_program"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + 
data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + std::string config_file = "--vt_lb_file_name=" + file_name; + + std::vector<char*> custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); + custom_args.emplace_back(const_cast<char*>(data_file.c_str())); + custom_args.emplace_back(const_cast<char*>(config_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); + EXPECT_EQ(theConfig()->vt_lb_data_in, false); + EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); + EXPECT_TRUE(theLBDataReader() != nullptr); +} + +TEST_F(TestInitialization, test_initialize_with_lb_data_and_lb_name) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); + + static char prog_name[]{"vt_program"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + std::string lb_name = "--vt_lb_name=OfflineLB"; + + std::vector<char*> custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(const_cast<char*>(lb_name.c_str())); + custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); + custom_args.emplace_back(const_cast<char*>(data_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, &comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "OfflineLB"); + EXPECT_EQ(theConfig()->vt_lb_data_in, false); + EXPECT_EQ(theConfig()->vt_lb_file_name, ""); 
+ EXPECT_TRUE(theLBDataReader() != nullptr); +} #endif TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { @@ -510,6 +588,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { vt::initialize(custom_argc, custom_argv, &comm); EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); EXPECT_EQ(theConfig()->vt_lb_data_in, true); EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); EXPECT_TRUE(theLBDataReader() == nullptr); From 2281472d11b7cc4c3846eeaccdf5bea48b55bf6e Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Fri, 24 May 2024 14:04:00 +0200 Subject: [PATCH 25/30] #2074: Allow to create LBDataRestartReader when lb_data_in is not used --- src/vt/runtime/runtime.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index e42d27146f..7b9319d79a 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -425,9 +425,15 @@ bool Runtime::tryFinalize(bool const disable_sig) { bool Runtime::needLBDataRestartReader() { using vrt::collection::balance::ReadLBConfig; + using vrt::collection::balance::LBType; + using vrt::collection::balance::get_lb_names; #if vt_check_enabled(lblite) - if (arg_config_->config_.vt_lb_data_in) { + bool data_in = arg_config_->config_.vt_lb_data_in; + bool requested_offline_lb = arg_config_->config_.vt_lb_name == get_lb_names()[LBType::OfflineLB]; + bool has_file = arg_config_->config_.vt_lb_file_name != ""; + + if (data_in || requested_offline_lb || has_file) { auto& config_file = arg_config_->config_.vt_lb_file_name; if (config_file != "") { bool const has_spec = ReadLBConfig::openConfig(config_file); From 32ef8beef2fb2d5f82cf20e12f36890ec7b8d550 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 4 Jun 2024 17:47:40 +0200 Subject: [PATCH 26/30] #2074: Abort when OfflineLB is misconfigured --- src/vt/runtime/runtime.cc | 8 +++----- 1 file 
changed, 3 insertions(+), 5 deletions(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 7b9319d79a..09f35d9623 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -429,11 +429,7 @@ bool Runtime::needLBDataRestartReader() { using vrt::collection::balance::get_lb_names; #if vt_check_enabled(lblite) - bool data_in = arg_config_->config_.vt_lb_data_in; - bool requested_offline_lb = arg_config_->config_.vt_lb_name == get_lb_names()[LBType::OfflineLB]; - bool has_file = arg_config_->config_.vt_lb_file_name != ""; - - if (data_in || requested_offline_lb || has_file) { + if (arg_config_->config_.vt_lb_data_in) { auto& config_file = arg_config_->config_.vt_lb_file_name; if (config_file != "") { bool const has_spec = ReadLBConfig::openConfig(config_file); @@ -442,6 +438,8 @@ bool Runtime::needLBDataRestartReader() { } } return true; + } else if (arg_config_->config_.vt_lb_name == get_lb_names()[LBType::OfflineLB]) { + vtAbort("VT cannot run OfflineLB without '--vt_lb_data_in' parameter."); } #endif From aab125786d64b2613413703a04b700b230f9d785 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Tue, 4 Jun 2024 17:48:14 +0200 Subject: [PATCH 27/30] #2074: Update tests after the change to show the error message when OfflineLB is not configured properly --- tests/unit/runtime/test_initialization.cc | 35 +---------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index 6d0d1ec35d..1c79700354 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -516,40 +516,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); EXPECT_EQ(theConfig()->vt_lb_data_in, false); EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); - EXPECT_TRUE(theLBDataReader() != nullptr); -} - -TEST_F(TestInitialization, 
test_initialize_with_lb_data_and_lb_name) { - MPI_Comm comm = MPI_COMM_WORLD; - - // Preapre data files - auto prefix = getUniqueFilenameWithRanks(); - prepareLBDataFiles(prefix); - - static char prog_name[]{"vt_program"}; - std::string data_file_dir = "--vt_lb_data_dir_in="; - data_file_dir += std::filesystem::current_path(); - std::string data_file = "--vt_lb_data_file_in="; - data_file += prefix + ".%p.json"; - std::string lb_name = "--vt_lb_name=OfflineLB"; - - std::vector<char*> custom_args; - custom_args.emplace_back(prog_name); - custom_args.emplace_back(const_cast<char*>(lb_name.c_str())); - custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); - custom_args.emplace_back(const_cast<char*>(data_file.c_str())); - custom_args.emplace_back(nullptr); - - int custom_argc = custom_args.size() - 1; - char** custom_argv = custom_args.data(); - - vt::initialize(custom_argc, custom_argv, &comm); - - EXPECT_EQ(theConfig()->prog_name, "vt_program"); - EXPECT_EQ(theConfig()->vt_lb_name, "OfflineLB"); - EXPECT_EQ(theConfig()->vt_lb_data_in, false); - EXPECT_EQ(theConfig()->vt_lb_file_name, ""); - EXPECT_TRUE(theLBDataReader() != nullptr); + EXPECT_TRUE(theLBDataReader() == nullptr); } #endif From d797363d9503e259485636be39b46277f1f09974 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Mon, 10 Jun 2024 13:55:35 +0200 Subject: [PATCH 28/30] #2074: Update detection of OfflineLB parameters --- src/vt/runtime/runtime.cc | 21 +++++------ tests/unit/runtime/test_initialization.cc | 45 +++++++++++++++++++++-- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index 09f35d9623..b378a0254f 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -428,22 +428,21 @@ bool Runtime::needLBDataRestartReader() { using vrt::collection::balance::LBType; using vrt::collection::balance::get_lb_names; + bool needOfflineLB = false; + #if vt_check_enabled(lblite) - if (arg_config_->config_.vt_lb_data_in) { - auto& 
config_file = arg_config_->config_.vt_lb_file_name; - if (config_file != "") { - bool const has_spec = ReadLBConfig::openConfig(config_file); - if (has_spec) { - return ReadLBConfig::hasOfflineLB(); - } - } - return true; - } else if (arg_config_->config_.vt_lb_name == get_lb_names()[LBType::OfflineLB]) { + if (ReadLBConfig::openConfig(arg_config_->config_.vt_lb_file_name)) { + needOfflineLB = ReadLBConfig::hasOfflineLB(); + } + + needOfflineLB = needOfflineLB || arg_config_->config_.vt_lb_name == get_lb_names()[LBType::OfflineLB]; + + if (needOfflineLB && !arg_config_->config_.vt_lb_data_in) { vtAbort("VT cannot run OfflineLB without '--vt_lb_data_in' parameter."); } #endif - return false; + return needOfflineLB; } bool Runtime::initialize(bool const force_now) { diff --git a/tests/unit/runtime/test_initialization.cc b/tests/unit/runtime/test_initialization.cc index 1c79700354..00309635f1 100644 --- a/tests/unit/runtime/test_initialization.cc +++ b/tests/unit/runtime/test_initialization.cc @@ -436,7 +436,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_in) { EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); EXPECT_EQ(theConfig()->vt_lb_data_in, true); EXPECT_EQ(theConfig()->vt_lb_file_name, ""); - EXPECT_TRUE(theLBDataReader() != nullptr); + EXPECT_TRUE(theLBDataReader() == nullptr); } TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { @@ -480,7 +480,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_offline_lb) { EXPECT_TRUE(theLBDataReader() != nullptr); } -TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { +TEST_F(TestInitialization, test_initialize_with_lb_data_and_no_lb) { MPI_Comm comm = MPI_COMM_WORLD; // Preapre data files @@ -490,10 +490,11 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { // Preapre configuration file std::string file_name = getUniqueFilenameWithRanks(".txt"); std::ofstream out(file_name); - out << "0 OfflineLB\n"; + out << "0 
NoLB\n"; out.close(); static char prog_name[]{"vt_program"}; + static char data_in[]{"--vt_lb_data_in"}; std::string data_file_dir = "--vt_lb_data_dir_in="; data_file_dir += std::filesystem::current_path(); std::string data_file = "--vt_lb_data_file_in="; @@ -502,6 +503,7 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { std::vector<char*> custom_args; custom_args.emplace_back(prog_name); + custom_args.emplace_back(data_in); custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); custom_args.emplace_back(const_cast<char*>(data_file.c_str())); custom_args.emplace_back(const_cast<char*>(config_file.c_str())); @@ -514,10 +516,45 @@ TEST_F(TestInitialization, test_initialize_with_lb_data_without_data_in) { EXPECT_EQ(theConfig()->prog_name, "vt_program"); EXPECT_EQ(theConfig()->vt_lb_name, "NoLB"); - EXPECT_EQ(theConfig()->vt_lb_data_in, false); + EXPECT_EQ(theConfig()->vt_lb_data_in, true); EXPECT_EQ(theConfig()->vt_lb_file_name, file_name); EXPECT_TRUE(theLBDataReader() == nullptr); } + +TEST_F(TestInitialization, test_initialize_with_lb_data_and_offline_lb) { + MPI_Comm comm = MPI_COMM_WORLD; + + // Preapre data files + auto prefix = getUniqueFilenameWithRanks(); + prepareLBDataFiles(prefix); + + static char prog_name[]{"vt_program"}; + static char data_in[]{"--vt_lb_data_in"}; + static char offline_lb[]{"--vt_lb_name=OfflineLB"}; + std::string data_file_dir = "--vt_lb_data_dir_in="; + data_file_dir += std::filesystem::current_path(); + std::string data_file = "--vt_lb_data_file_in="; + data_file += prefix + ".%p.json"; + + std::vector<char*> custom_args; + custom_args.emplace_back(prog_name); + custom_args.emplace_back(data_in); + custom_args.emplace_back(offline_lb); + custom_args.emplace_back(const_cast<char*>(data_file_dir.c_str())); + custom_args.emplace_back(const_cast<char*>(data_file.c_str())); + custom_args.emplace_back(nullptr); + + int custom_argc = custom_args.size() - 1; + char** custom_argv = custom_args.data(); + + vt::initialize(custom_argc, custom_argv, 
&comm); + + EXPECT_EQ(theConfig()->prog_name, "vt_program"); + EXPECT_EQ(theConfig()->vt_lb_name, "OfflineLB"); + EXPECT_EQ(theConfig()->vt_lb_data_in, true); + EXPECT_EQ(theConfig()->vt_lb_file_name, ""); + EXPECT_TRUE(theLBDataReader() != nullptr); +} #endif TEST_F(TestInitialization, test_initialize_with_lb_data_and_config_no_lb) { From dd8ff7560b75cdc13f05eeb07236960aaf13cd94 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Wed, 12 Jun 2024 17:03:47 +0200 Subject: [PATCH 29/30] #2074: Add support for vtAbort to be called during init process --- src/vt/configs/error/stack_out.cc | 2 +- src/vt/runtime/runtime.cc | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/vt/configs/error/stack_out.cc b/src/vt/configs/error/stack_out.cc index 3dd30029bd..97c5fd1520 100644 --- a/src/vt/configs/error/stack_out.cc +++ b/src/vt/configs/error/stack_out.cc @@ -164,7 +164,7 @@ std::string prettyPrintStack(DumpStackType const& stack) { auto magenta = ::vt::debug::magenta(); auto yellow = ::vt::debug::yellow(); auto vt_pre = ::vt::debug::vtPre(); - auto node = ::vt::theContext()->getNode(); + auto node = ::vt::theContext() ? 
::vt::theContext()->getNode() : -1; auto node_str = ::vt::debug::proc(node); auto prefix = vt_pre + node_str + " "; auto separator = fmt::format("{}{}{:-^120}{}\n", prefix, yellow, "", reset); diff --git a/src/vt/runtime/runtime.cc b/src/vt/runtime/runtime.cc index b378a0254f..d87182cd37 100644 --- a/src/vt/runtime/runtime.cc +++ b/src/vt/runtime/runtime.cc @@ -579,7 +579,7 @@ void Runtime::reset() { void Runtime::abort(std::string const abort_str, ErrorCodeType const code) { output(abort_str, code, true, true, false); - if (theConfig()->vt_throw_on_abort) { + if (theContext && theConfig()->vt_throw_on_abort) { throw std::runtime_error(abort_str); } else { aborted_ = true; @@ -640,7 +640,12 @@ void Runtime::output( fmt::print(stderr, "{}\n", prefix); } - if (!theConfig()->vt_no_stack) { + if (theContext == nullptr) { + // Too early in init process to check dump settings - always dump stack. + auto stack = debug::stack::dumpStack(); + auto stack_pretty = debug::stack::prettyPrintStack(stack); + fmt::print("{}", stack_pretty); + } else if (!theConfig()->vt_no_stack) { bool const on_abort = !theConfig()->vt_no_abort_stack; bool const on_warn = !theConfig()->vt_no_warn_stack; bool const dump = (error && on_abort) || (!error && on_warn); From 61d324a871376f00df1b43e8c161bea757976474 Mon Sep 17 00:00:00 2001 From: Arkadiusz Szczepkowicz Date: Thu, 13 Jun 2024 11:03:59 +0200 Subject: [PATCH 30/30] #2074: Support early call to vtAbort when tracing is enabled --- src/vt/collective/collective_ops.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/vt/collective/collective_ops.cc b/src/vt/collective/collective_ops.cc index b69d7d1122..022b457de8 100644 --- a/src/vt/collective/collective_ops.cc +++ b/src/vt/collective/collective_ops.cc @@ -293,8 +293,10 @@ void CollectiveAnyOps::abort( auto myrt = tls_rt ? 
tls_rt : ::vt::rt; if (myrt) { #if vt_check_enabled(trace_enabled) - //--- Try to flush most of the traces before aborting - myrt->theTrace->cleanupTracesFile(); + if (myrt->theTrace) { + //--- Try to flush most of the traces before aborting + myrt->theTrace->cleanupTracesFile(); + } #endif myrt->abort(str, code); } else if (vt::debug::preConfig()->vt_throw_on_abort) {