multiple mpmc queue #37

Open
wants to merge 27 commits into base: master
Commits
27 commits
30ac35f
Changed spsc queue to mpmc queue.
Oct 19, 2020
caf826b
run experiments.
Oct 26, 2020
74b8e6a
added multiple mpmc queue.
Oct 31, 2020
48c6a19
revised concurrent queue.
Nov 1, 2020
f208a2d
applied partitioning the tasks.
Nov 3, 2020
04e1570
get debug info for the project.
Nov 3, 2020
a2b1824
multiple mpmc queue supports partitioning.
Nov 13, 2020
a1bd255
added bulk update.
Nov 15, 2020
6f2cb0b
integrated google benchmark
Nov 16, 2020
23056e2
added delete to the benchmark
Nov 16, 2020
22ba5f5
commented google benchmark.
Nov 19, 2020
a5c4723
added work stealing for threads.
Nov 19, 2020
c9b1f99
added global mpmcQ
Nov 25, 2020
a248f78
Merge branch 'mpmcQ' of https://github.com/domargan/parallel-packed-csr
Nov 25, 2020
649e5cc
Merge branch 'numa-mpmcQ' of https://github.com/domargan/parallel-pac…
Nov 25, 2020
097a2c2
added multiple mpmcQ
Nov 25, 2020
6383a67
removed bulk update.
Nov 25, 2020
9fefbef
removed work stealing
Nov 25, 2020
f8acffe
revised the number of queues.
Nov 29, 2020
2736a9d
added cluster threads distribution.
Dec 3, 2020
8e934ad
removed function calls for number of cpus
Dec 3, 2020
bd19872
added balanced queue distribution.
Dec 4, 2020
950e017
incorporated constraint of the number of queues.
Dec 4, 2020
800277a
balanced threads distribution over queues and activated work stealing.
Dec 6, 2020
d0033a3
clustered threads distribution.
Dec 6, 2020
6477257
corrected the name of the thread-to-partition mapping data structure.
Dec 7, 2020
d0b7a43
added support for clustered/balanced thread distribution.
Dec 7, 2020
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(CMAKE_VERBOSE_MAKEFILE ON)
include(ExternalProject)

project(parallel-packed-csr VERSION 0.1 LANGUAGES CXX)

@@ -14,6 +15,11 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3")

set(PROJECT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src)

ExternalProject_Add(googlebenchmark
URL "https://github.com/google/benchmark/archive/v1.5.0.tar.gz"
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${parallel-packed-csr_BINARY_DIR}/deps -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
)

file(GLOB_RECURSE parallel-packed-csr_SOURCES "${PROJECT_SOURCE_DIR}/*.cpp")
file(GLOB_RECURSE parallel-packed-csr_HEADERS "${PROJECT_SOURCE_DIR}/*.h")

@@ -38,6 +44,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
target_link_libraries(parallel-packed-csr PRIVATE Threads::Threads numa)

add_dependencies(parallel-packed-csr googlebenchmark)
target_link_libraries(parallel-packed-csr PRIVATE ${parallel-packed-csr_BINARY_DIR}/deps/lib/${CMAKE_SHARED_LIBRARY_PREFIX}benchmark.a)
target_link_libraries(parallel-packed-csr PRIVATE ${parallel-packed-csr_BINARY_DIR}/deps/lib/${CMAKE_SHARED_LIBRARY_PREFIX}benchmark_main.a)
target_include_directories(parallel-packed-csr SYSTEM PUBLIC ${parallel-packed-csr_BINARY_DIR}/deps/include)

list(REMOVE_ITEM parallel-packed-csr_SOURCES ${PROJECT_SOURCE_DIR}/main.cpp)
add_executable(tests ${parallel-packed-csr_SOURCES} ${parallel-packed-csr_TEST_SOURCES})
add_executable(tests-tsan ${parallel-packed-csr_SOURCES} ${parallel-packed-csr_TEST_SOURCES})
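Note: the ExternalProject_Add block above fetches Google Benchmark v1.5.0 into deps/ and the main target now links the static benchmark and benchmark_main archives. A minimal sketch of a micro-benchmark that this linkage makes possible; the BM_EdgeInsert name and the measured body are illustrative assumptions, not code from this PR:

    #include <benchmark/benchmark.h>

    // Hypothetical micro-benchmark: the name and the measured body are
    // placeholders; the PR itself only wires the library into the build.
    static void BM_EdgeInsert(benchmark::State &state) {
      for (auto _ : state) {
        // ... insert a batch of edges into the structure under test ...
        benchmark::DoNotOptimize(state.iterations());
      }
    }
    BENCHMARK(BM_EdgeInsert);
    // main() is supplied by the linked benchmark_main archive.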
19 changes: 15 additions & 4 deletions src/main.cpp
@@ -17,13 +17,17 @@
#include <thread>
#include <utility>
#include <vector>
#include <functional>
#include <map>

#include "thread_pool/thread_pool.h"
#include "thread_pool_pppcsr/thread_pool_pppcsr.h"

#include <benchmark/benchmark.h>

using namespace std;

enum class Operation { READ, ADD, DELETE };
// enum class Operation { READ, ADD, DELETE };

// Reads edge list with separator
pair<vector<tuple<Operation, int, int>>, int> read_input(string filename, Operation defaultOp) {
@@ -85,6 +89,8 @@ void update_existing_graph(const vector<tuple<Operation, int, int>> &input, Thre
template <typename ThreadPool_t>
void execute(int threads, int size, const vector<tuple<Operation, int, int>> &core_graph,
const vector<tuple<Operation, int, int>> &updates, std::unique_ptr<ThreadPool_t> &thread_pool) {


// Load core graph
update_existing_graph(core_graph, thread_pool.get(), threads, core_graph.size());
// Do updates
@@ -114,6 +120,7 @@ int main(int argc, char *argv[]) {
int num_nodes = 0;
bool lock_search = true;
bool insert = true;
bool balance = true;
Version v = Version::PPPCSRNUMA;
int partitions_per_domain = 1;
vector<tuple<Operation, int, int>> core_graph;
@@ -130,6 +137,10 @@ int main(int argc, char *argv[]) {
insert = true;
} else if (s.rfind("-delete", 0) == 0) {
insert = false;
} else if (s.rfind("-balance", 0) == 0) {
balance = true;
} else if (s.rfind("-cluster", 0) == 0) {
balance = false;
} else if (s.rfind("-pppcsrnuma", 0) == 0) {
v = Version::PPPCSRNUMA;
} else if (s.rfind("-pppcsr", 0) == 0) {
@@ -174,16 +185,16 @@ int main(int argc, char *argv[]) {
}
case Version::PPPCSR: {
auto thread_pool =
make_unique<ThreadPoolPPPCSR>(threads, lock_search, num_nodes + 1, partitions_per_domain, false);
make_unique<ThreadPoolPPPCSR>(threads, lock_search, num_nodes + 1, partitions_per_domain, false, balance);
execute(threads, size, core_graph, updates, thread_pool);
break;
}
default: {
auto thread_pool =
make_unique<ThreadPoolPPPCSR>(threads, lock_search, num_nodes + 1, partitions_per_domain, true);
make_unique<ThreadPoolPPPCSR>(threads, lock_search, num_nodes + 1, partitions_per_domain, true, balance);
execute(threads, size, core_graph, updates, thread_pool);
}
}

return 0;
}
}
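Note: the new -balance and -cluster flags choose between two thread/queue distribution strategies in the thread pool. A small self-contained sketch of the queue-sizing rule they select, mirroring the ThreadPoolPPPCSR constructor further down; the helper function itself is illustrative and not part of the PR:

    #include <algorithm>
    #include <cmath>
    #include <thread>

    // Illustrative helper mirroring ThreadPoolPPPCSR's queue sizing:
    // -balance: one queue per partition, capped by the number of threads;
    // -cluster: threads are packed per NUMA domain, one queue per cluster.
    inline size_t queue_count(bool balance, int num_threads, int available_nodes,
                              int partitions_per_domain) {
      if (balance) {
        return std::min<size_t>(static_cast<size_t>(available_nodes) * partitions_per_domain,
                                static_cast<size_t>(num_threads));
      }
      auto threads_per_domain = std::thread::hardware_concurrency() / available_nodes;
      return static_cast<size_t>(std::ceil(num_threads / static_cast<double>(threads_per_domain)));
    }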
2 changes: 1 addition & 1 deletion src/pcsr/PCSR.cpp
@@ -69,7 +69,7 @@ void PCSR::resizeEdgeArray(size_t newSize) {
edges.N = newSize;
edges.logN = (1 << bsr_word(bsr_word(edges.N) * 2 + 1));
edges.H = bsr_word(edges.N / edges.logN);
std::cout << "Edges: " << edges.N << " logN: " << edges.logN << " #count: " << edges.N / edges.logN << std::endl;
// std::cout << "Edges: " << edges.N << " logN: " << edges.logN << " #count: " << edges.N / edges.logN << std::endl;
}

void PCSR::clear() {
2 changes: 2 additions & 0 deletions src/pcsr/PCSR.h
@@ -14,6 +14,8 @@ using namespace std;
#ifndef PCSR2_PCSR_H
#define PCSR2_PCSR_H

enum class Operation { READ, ADD, DELETE };

/** Types */
typedef struct _node {
// beginning and end of the associated region in the edge list
2 changes: 1 addition & 1 deletion src/pppcsr/PPPCSR.cpp
@@ -30,7 +30,7 @@ PPPCSR::PPPCSR(uint32_t init_n, uint32_t src_n, bool lock_search, int numDomain,
partitions.emplace_back(partitionSize, partitionSize, lock_search, (use_numa) ? i : -1);
}
}
cout << "Number of partitions: " << partitions.size() << std::endl;
// cout << "Number of partitions: " << partitions.size() << std::endl;
}

bool PPPCSR::edge_exists(uint32_t src, uint32_t dest) {
145 changes: 83 additions & 62 deletions src/thread_pool_pppcsr/thread_pool_pppcsr.cpp
@@ -11,39 +11,53 @@
#include <mutex>
#include <thread>
#include <vector>
#include <cmath>

using namespace std;

/**
* Initializes a pool of threads. Every thread has its own task queue.
*/
ThreadPoolPPPCSR::ThreadPoolPPPCSR(const int NUM_OF_THREADS, bool lock_search, uint32_t init_num_nodes,
int partitions_per_domain, bool use_numa)
: tasks(NUM_OF_THREADS),
finished(false),
available_nodes(std::min(numa_max_node() + 1, NUM_OF_THREADS)),
int partitions_per_domain, bool use_numa, bool balance)
: finished(false),
available_nodes(numa_max_node() + 1),
indeces(available_nodes, 0),
partitions_per_domain(partitions_per_domain),
threadToDomain(NUM_OF_THREADS),
threadToQueue(NUM_OF_THREADS),
firstThreadDomain(available_nodes, 0),
numThreadsDomain(available_nodes) {
pcsr = new PPPCSR(init_num_nodes, init_num_nodes, lock_search, available_nodes, partitions_per_domain, use_numa);

int d = available_nodes;
int minNumThreads = NUM_OF_THREADS / d;
int threshold = NUM_OF_THREADS % d;
int counter = 0;
int currentDomain = 0;

for (int i = 0; i < NUM_OF_THREADS; i++) {
threadToDomain[i] = currentDomain;
counter++;
if (counter == minNumThreads + (currentDomain < threshold)) {
numThreadsDomain[currentDomain] = counter;
firstThreadDomain[currentDomain] = i - counter + 1;
counter = 0;
currentDomain++;
}
numThreadsDomain(available_nodes),
balance(balance) {

if(balance){
numberOfQueues = min(available_nodes * partitions_per_domain, NUM_OF_THREADS);
}
else{
auto threadsPerDomain = thread::hardware_concurrency()/available_nodes;
numberOfQueues = ceil(NUM_OF_THREADS/ (double)threadsPerDomain);
}

tasks = vector<moodycamel::ConcurrentQueue<task>>(numberOfQueues);

pcsr =
new PPPCSR(init_num_nodes, init_num_nodes, lock_search, available_nodes, partitions_per_domain, use_numa);

int d = available_nodes;
int minNumThreads = NUM_OF_THREADS / d;
int threshold = NUM_OF_THREADS % d;
int counter = 0;
int currentDomain = 0;

for (int i = 0; i < NUM_OF_THREADS; i++) {
threadToDomain[i] = currentDomain;
counter++;
if (counter == minNumThreads + (currentDomain < threshold)) {
numThreadsDomain[currentDomain] = counter;
firstThreadDomain[currentDomain] = i - counter + 1;
counter = 0;
currentDomain++;
}
}
}

@@ -52,21 +66,25 @@ ThreadPoolPPPCSR::ThreadPoolPPPCSR(const int NUM_OF_THREADS, bool lock_search, u
// Finishes when finished is set to true and there are no outstanding tasks
template <bool isMasterThread>
void ThreadPoolPPPCSR::execute(const int thread_id) {
cout << "Thread " << thread_id << " has " << tasks[thread_id].size() << " tasks, runs on domain "
<< threadToDomain[thread_id] << endl;
if (numa_available() >= 0) {
numa_run_on_node(threadToDomain[thread_id]);
}
int registered = -1;
auto queue_id = threadToQueue[thread_id];
auto queueCounter = 1;

while (!tasks[queue_id].empty() || (!isMasterThread && !finished)) {
while(queueCounter <= numberOfQueues &&
tasks[queue_id].empty()){
queue_id = (queue_id + 1) % numberOfQueues;
queueCounter++;
}
if (!tasks[queue_id].empty()) {
task t = tasks[queue_id].front();

while (!tasks[thread_id].empty() || (!isMasterThread && !finished)) {
if (!tasks[thread_id].empty()) {
task t = tasks[thread_id].front();
tasks[thread_id].pop();

int currentPar = pcsr->get_partiton(t.src);
int currentPar = threadToQueue[thread_id];

if (registered != currentPar) {
if (registered != currentPar) {
if (registered != -1) {
pcsr->unregisterThread(registered);
}
@@ -81,10 +99,10 @@ void ThreadPoolPPPCSR::execute(const int thread_id) {
pcsr->read_neighbourhood(t.src);
}
} else {
if (registered != -1) {
pcsr->unregisterThread(registered);
registered = -1;
}
if (registered != -1) {
pcsr->unregisterThread(registered);
registered = -1;
}
}
}
if (registered != -1) {
@@ -94,50 +112,53 @@

// Submit an update for edge {src, target} to thread with number thread_id
void ThreadPoolPPPCSR::submit_add(int thread_id, int src, int target) {
(void)thread_id;
auto par = pcsr->get_partiton(src) / partitions_per_domain;
auto index = (indeces[par]++) % numThreadsDomain[par];
tasks[firstThreadDomain[par] + index].push(task{true, false, src, target});
auto par = pcsr->get_partiton(src);
auto queue_id = balance ? queueTurn : par % numberOfQueues;
threadToQueue[thread_id] = queue_id;
queueTurn = (queueTurn + 1) % numberOfQueues;
tasks[queue_id].push(task{true, false, src, target});
}

// Submit a delete edge task for edge {src, target} to thread with number thread_id
void ThreadPoolPPPCSR::submit_delete(int thread_id, int src, int target) {
(void)thread_id;
auto par = pcsr->get_partiton(src) / partitions_per_domain;
auto index = (indeces[par]++) % numThreadsDomain[par];
tasks[firstThreadDomain[par] + index].push(task{false, false, src, target});
auto par = pcsr->get_partiton(src);
auto queue_id = balance ? queueTurn : par % numberOfQueues;
threadToQueue[thread_id] = queue_id;
queueTurn = (queueTurn + 1) % numberOfQueues;
tasks[queue_id].push(task{false, false, src, target});
}

// Submit a read neighbourhood task for vertex src to thread with number thread_id
void ThreadPoolPPPCSR::submit_read(int thread_id, int src) {
(void)thread_id;
auto par = pcsr->get_partiton(src) / partitions_per_domain;
auto index = (indeces[par]++) % numThreadsDomain[par];
tasks[firstThreadDomain[par] + index].push(task{false, true, src, src});
auto par = pcsr->get_partiton(src);
auto queue_id = balance ? queueTurn : par % numberOfQueues;
threadToQueue[thread_id] = queue_id;
queueTurn = (queueTurn + 1) % numberOfQueues;
tasks[queue_id].push(task{false, true, src, src});
}

// starts a new number of threads
// number of threads is passed to the constructor
void ThreadPoolPPPCSR::start(int threads) {
s = chrono::steady_clock::now();
finished = false;
// finished = false;

for (int i = 1; i < threads; i++) {
thread_pool.push_back(thread(&ThreadPoolPPPCSR::execute<false>, this, i));
// Pin thread to core
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET((i * 4), &cpuset);
// if (i >= 4) {
// CPU_SET(1 + (i * 4), &cpuset);
// } else {
// CPU_SET(i * 4, &cpuset);
// }
// int rc = pthread_setaffinity_np(thread_pool.back().native_handle(),
// sizeof(cpu_set_t), &cpuset);
// if (rc != 0) {
// cout << "error pinning thread" << endl;
// }
// cpu_set_t cpuset;
// CPU_ZERO(&cpuset);
// CPU_SET((i * 4), &cpuset);
// if (i >= 4) {
// CPU_SET(1 + (i * 4), &cpuset);
// } else {
// CPU_SET(i * 4, &cpuset);
// }
// int rc = pthread_setaffinity_np(thread_pool.back().native_handle(),
// sizeof(cpu_set_t), &cpuset);
// if (rc != 0) {
// cout << "error pinning thread" << endl;
// }
}
execute<true>(0);
}
Expand All @@ -148,7 +169,7 @@ void ThreadPoolPPPCSR::stop() {
finished = true;
for (auto &&t : thread_pool) {
if (t.joinable()) t.join();
cout << "Done" << endl;
// cout << "Done" << endl;
}
end = chrono::steady_clock::now();
cout << "Elapsed wall clock time: " << chrono::duration_cast<chrono::milliseconds>(end - s).count() << endl;
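Note: with this change each worker scans all task queues round-robin rather than staying pinned to a single per-thread queue, and the queues themselves become moodycamel::ConcurrentQueue instances. A minimal sketch of that scan written against moodycamel's lock-free try_dequeue API; the task layout and the dispatch body are assumptions for illustration, not code from this PR:

    #include <cstddef>
    #include <vector>
    #include "../utility/concurrentqueue.h"  // moodycamel::ConcurrentQueue

    struct task { bool add; bool read; int src; int target; };  // assumed layout, see task.h

    // Illustrative drain loop: visit every queue once, starting from the queue
    // this thread was last routed to, and pop tasks with the lock-free API.
    void drain_round_robin(std::vector<moodycamel::ConcurrentQueue<task>> &tasks,
                           std::size_t start_queue) {
      const std::size_t n = tasks.size();
      task t;
      for (std::size_t attempt = 0; attempt < n; ++attempt) {
        std::size_t queue_id = (start_queue + attempt) % n;  // wrap over all queues
        while (tasks[queue_id].try_dequeue(t)) {
          // ... dispatch: add_edge / remove_edge / read_neighbourhood ...
        }
      }
    }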
11 changes: 9 additions & 2 deletions src/thread_pool_pppcsr/thread_pool_pppcsr.h
@@ -8,18 +8,21 @@
#include <vector>

#include "../pppcsr/PPPCSR.h"
#include "../utility/concurrentqueue.h"
#include "task.h"

using namespace std;
#ifndef PPPCSR_THREAD_POOL_H
#define PPPCSR_THREAD_POOL_H

// enum class Operation { READ, ADD, DELETE };

class ThreadPoolPPPCSR {
public:
PPPCSR *pcsr;

explicit ThreadPoolPPPCSR(const int NUM_OF_THREADS, bool lock_search, uint32_t init_num_nodes,
int partitions_per_domain, bool use_numa);
int partitions_per_domain, bool use_numa, bool balance);
~ThreadPoolPPPCSR() = default;
/** Public API */
void submit_add(int thread_id, int src, int dest); // submit task to thread {thread_id} to insert edge {src, dest}
@@ -30,7 +33,8 @@ class ThreadPoolPPPCSR {

private:
vector<thread> thread_pool;
vector<queue<task>> tasks;
vector<moodycamel::ConcurrentQueue<task>> tasks;
size_t numberOfQueues;
chrono::steady_clock::time_point s;
chrono::steady_clock::time_point end;
std::atomic_bool finished;
Expand All @@ -39,9 +43,12 @@ class ThreadPoolPPPCSR {
void execute(int);

const int available_nodes;
const bool balance;
size_t queueTurn = 0;
std::vector<unsigned> indeces;
int partitions_per_domain = 1;
std::vector<int> threadToDomain;
std::vector<int> threadToQueue;
std::vector<int> firstThreadDomain;
std::vector<int> numThreadsDomain;
};
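Note: on the submit side, balanced mode round-robins incoming tasks over all queues via the queueTurn cursor, while clustered mode keys the queue on the source vertex's partition. A compact illustrative sketch of that routing rule; the free function is not part of the PR, and the names mirror the members above:

    #include <cstddef>

    // queueTurn is the pool's round-robin cursor; partition_of_src comes from
    // pcsr->get_partiton(src).
    std::size_t route_task(bool balance, std::size_t &queueTurn,
                           std::size_t numberOfQueues, int partition_of_src) {
      std::size_t queue_id = balance ? queueTurn
                                     : static_cast<std::size_t>(partition_of_src) % numberOfQueues;
      queueTurn = (queueTurn + 1) % numberOfQueues;  // cursor advances on every submit
      return queue_id;
    }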