-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[no ci] add celerity blockchain for task divergence checking
- Loading branch information
Showing
15 changed files
with
924 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
#pragma once | ||
|
||
#include "recorders.h" | ||
#include <mutex> | ||
#include <thread> | ||
#include <vector> | ||
|
||
namespace celerity::detail { | ||
/** | ||
* @brief This class is a wrapper around a 1D vector that allows us to access it as a 2D array. | ||
* | ||
* It is used to send the task hashes to other nodes using MPI while keeping the code simple and readable. | ||
*/ | ||
template <typename T> | ||
struct mpi_2d_send_wrapper { | ||
public: | ||
mpi_2d_send_wrapper(size_t width, size_t height) : m_data(width * height), m_width(width){}; | ||
|
||
const T& operator[](std::pair<int, int> ij) const { | ||
assert(ij.first * m_width + ij.second < m_data.size()); | ||
return m_data[ij.first * m_width + ij.second]; | ||
} | ||
|
||
T* data() { return m_data.data(); } | ||
|
||
private: | ||
std::vector<T> m_data; | ||
const size_t m_width; | ||
}; | ||
|
||
/** | ||
* @brief This class gives a view into a const vector. | ||
* | ||
* It is used to give us the currently unhashed task records while keeping track of the offset and width. | ||
*/ | ||
template <typename T> | ||
struct window { | ||
public: | ||
window(const std::vector<T>& value) : m_value(value) {} | ||
|
||
const T& operator[](size_t i) const { | ||
assert(i >= 0 && i < m_width); | ||
return m_value[m_offset + i]; | ||
} | ||
|
||
size_t size() { | ||
m_width = m_value.size() - m_offset; | ||
return m_width; | ||
} | ||
|
||
void slide(size_t i) { | ||
assert(i == 0 || (i >= 0 && i <= m_width)); | ||
m_offset += i; | ||
m_width -= i; | ||
} | ||
|
||
private: | ||
const std::vector<T>& m_value; | ||
size_t m_offset = 0; | ||
size_t m_width = 0; | ||
}; | ||
|
||
using task_hash = size_t; | ||
using task_hash_data = mpi_2d_send_wrapper<task_hash>; | ||
using divergence_map = std::unordered_map<task_hash, std::vector<node_id>>; | ||
|
||
/** | ||
* @brief This class is the base implementation for the divergence check. | ||
* | ||
* It is responsible for collecting the task hashes from all nodes and checking for differences -> divergence. | ||
* When a divergence is found, the task record for the diverging task is printed and the program is terminated. | ||
* Additionally it also checks for deadlocks and prints a warning if one is detected. | ||
* | ||
* The class is abstract to allow a different divergence check implementation in tests | ||
*/ | ||
class abstract_block_chain { | ||
friend struct abstract_block_chain_testspy; | ||
|
||
public: | ||
abstract_block_chain(size_t num_nodes, node_id local_nid, const std::vector<task_record>& task_recorder, MPI_Comm comm) | ||
: m_local_nid(local_nid), m_num_nodes(num_nodes), m_sizes(num_nodes), m_task_recorder_window(task_recorder), m_comm(comm) {} | ||
|
||
abstract_block_chain(const abstract_block_chain&) = delete; | ||
abstract_block_chain(abstract_block_chain&&) = default; | ||
|
||
virtual ~abstract_block_chain() = default; | ||
|
||
abstract_block_chain& operator=(const abstract_block_chain&) = delete; | ||
abstract_block_chain& operator=(abstract_block_chain&&) = delete; | ||
|
||
virtual void stop() { m_is_running = false; }; | ||
|
||
protected: | ||
node_id m_local_nid; | ||
size_t m_num_nodes; | ||
|
||
std::vector<task_hash> m_hashes; | ||
std::vector<int> m_sizes; | ||
|
||
bool m_is_running = true; | ||
|
||
window<task_record> m_task_recorder_window; | ||
|
||
std::chrono::time_point<std::chrono::steady_clock> m_last_cleared = std::chrono::steady_clock::now(); | ||
|
||
MPI_Comm m_comm; | ||
|
||
void start() { m_is_running = true; }; | ||
|
||
virtual void run() = 0; | ||
|
||
virtual void divergence_out(const divergence_map& check_map, const int task_num) = 0; | ||
|
||
void add_new_hashes(); | ||
void clear(const int min_progress); | ||
virtual void allgather_sizes(); | ||
virtual void allgather_hashes(const int max_size, task_hash_data& data); | ||
std::pair<int, int> collect_sizes(); | ||
task_hash_data collect_hashes(const int max_size); | ||
divergence_map create_check_map(const task_hash_data& task_graphs, const int task_num) const; | ||
|
||
void check_for_deadlock() const; | ||
|
||
static void print_node_divergences(const divergence_map& check_map, const int task_num); | ||
|
||
static void print_task_record(const divergence_map& check_map, const task_record& task, const task_hash hash); | ||
|
||
virtual void dedub_print_task_record(const divergence_map& check_map, const int task_num) const; | ||
|
||
bool check_for_divergence(); | ||
}; | ||
|
||
/** | ||
* @brief This class is the main implementation for the divergence check. | ||
*/ | ||
class divergence_block_chain : public abstract_block_chain { | ||
public: | ||
divergence_block_chain(size_t num_nodes, node_id local_nid, const std::vector<task_record>& task_record, MPI_Comm comm, bool test_mode = false) | ||
: abstract_block_chain(num_nodes, local_nid, task_record, comm), m_test_mode(test_mode) { | ||
divergence_block_chain::start(); | ||
} | ||
|
||
divergence_block_chain(const divergence_block_chain&) = delete; | ||
divergence_block_chain(divergence_block_chain&&) = default; | ||
|
||
divergence_block_chain& operator=(const divergence_block_chain&) = delete; | ||
divergence_block_chain& operator=(divergence_block_chain&&) = delete; | ||
|
||
~divergence_block_chain() override { divergence_block_chain::stop(); } | ||
|
||
void start(); | ||
void stop() override; | ||
|
||
private: | ||
std::thread m_thread; | ||
bool m_test_mode = false; | ||
|
||
void run() override; | ||
void divergence_out(const divergence_map& check_map, const int task_num) override; | ||
}; | ||
}; // namespace celerity::detail |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.