facebookresearch · mpmisko · Aug 29, 2019 · Sep 4, 2019
diff --git a/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h b/sparseconvnet/SCN/Metadata/SubmanifoldConvolutionRules.h
@@ -7,6 +7,8 @@
 #ifndef SUBMANIFOLDCONVOLUTIONRULES_H
 #define SUBMANIFOLDCONVOLUTIONRULES_H
 
+#include <algorithm>
+
 // Full input region for an output point
 template <Int dimension>
 RectangularRegion<dimension>
@@ -22,7 +24,6 @@ InputRegionCalculator_Submanifold(const Point<dimension> &output, long *size) {
 
 // Call for each convolutional / max-pooling layer, once for each batch item.
 // rules is used to carry out the "lowering" whilst carrying out the convolution
-
 template <Int dimension>
 double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
                                         RuleBook &rules, long *size) {
@@ -44,41 +45,104 @@ double SubmanifoldConvolution_SgToRules(SparseGrid<dimension> &grid,
   return countActiveInputs;
 }
 
+template <Int dimension>
+double SubmanifoldConvolution_SgToRules_par(SparseGrid<dimension> &grid,
+                                            std::vector<RuleBook> &rulebooks,
+                                            long *size, const Int threadCount) {
+  double countActiveInputs = 0;
+  std::vector<std::thread> threads;
+  std::vector<int> activeInputs(threadCount, 0);
+
+  auto func = [&](const int order) {
+    auto outputIter = grid.mp.begin();
+    auto &rb = rulebooks[order];
+    int rem = grid.mp.size();
+    int aciveInputCount = 0;
+
+    if (rem > order) {
+      std::advance(outputIter, order);
+      rem -= order;
+
+      for (; outputIter != grid.mp.end();
+           std::advance(outputIter, std::min(threadCount, rem)),
+           rem -= threadCount) {
+        auto inRegion = InputRegionCalculator_Submanifold<dimension>(
+            outputIter->first, size);
+        Int rulesOffset = 0;
+        for (auto inputPoint : inRegion) {
+          auto inputIter = grid.mp.find(inputPoint);
+          if (inputIter != grid.mp.end()) {
+            aciveInputCount++;
+            rb[rulesOffset].push_back(inputIter->second + grid.ctr);
+            rb[rulesOffset].push_back(outputIter->second + grid.ctr);
+          }
+          rulesOffset++;
+        }
+      }
+    }
+
+    activeInputs[order] = aciveInputCount;
+  };
+
+  for (Int t = 0; t < threadCount; ++t) {
+    threads.push_back(std::thread(func, t));
+  }
+
+  for (Int t = 0; t < threadCount; ++t) {
+    threads[t].join();
+    countActiveInputs += activeInputs[t];
+  }
+
+  return countActiveInputs;
+}
+
 template <Int dimension>
 Int SubmanifoldConvolution_SgsToRules(SparseGrids<dimension> &SGs,
                                       RuleBook &rules, long *size) {
   Int sd = volume<dimension>(size);
   Int countActiveInputs = 0;
+
   rules.clear();
   rules.resize(sd);
   for (Int i = 0; i < (Int)SGs.size(); i++)
     countActiveInputs +=
         SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rules, size);
   return countActiveInputs;
 }
+
 template <Int dimension>
 Int SubmanifoldConvolution_SgsToRules_OMP(SparseGrids<dimension> &SGs,
                                           RuleBook &rules, long *size) {
-  std::vector<RuleBook> rbs(SGs.size());
+  std::vector<std::vector<RuleBook>> rbs(SGs.size());
   std::vector<double> countActiveInputs(SGs.size());
   rules.clear();
   Int sd = volume<dimension>(size);
   rules.resize(sd);
+  const Int threadCount = 4;
+
+  for (Int i = 0; i < SGs.size(); ++i) {
+    std::vector<RuleBook> rulebooks;
+    for (Int t = 0; t < threadCount; ++t) {
+      rulebooks.push_back(RuleBook(sd));
+    }
+    rbs.push_back(rulebooks);
+  }
+
   {
     Int i;
 #pragma omp parallel for private(i)
     for (i = 0; i < (Int)SGs.size(); i++) {
-      rbs[i].resize(sd);
-      countActiveInputs[i] =
-          SubmanifoldConvolution_SgToRules<dimension>(SGs[i], rbs[i], size);
+      countActiveInputs[i] = SubmanifoldConvolution_SgToRules_par<dimension>(
+          SGs[i], rbs[i], size, threadCount);
     }
   }
   {
     Int i;
 #pragma omp parallel for private(i)
     for (i = 0; i < sd; i++)
       for (auto const &rb : rbs)
-        rules[i].insert(rules[i].end(), rb[i].begin(), rb[i].end());
+        for (Int t = 0; t < threadCount; ++t)
+          rules[i].insert(rules[i].end(), rb[i][t].begin(), rb[i][t].end());
   }
   Int countActiveInputs_ = 0;
   for (auto &i : countActiveInputs)