caffe2: use at::mt19937 instead of std::mt19937 (10x speedup) (pytorch#43987)

d4l3k · facebook-github-bot · commit 0c9787c7584f · 2020-10-16T16:08:35.000-07:00
Summary: Pull Request resolved: pytorch#43987. This replaces the caffe2 CPU random number generator (std::mt19937) with at::mt19937, which is the one currently used in pytorch. The ATen RNG is 10x faster than the std one and appears to be more robust given bugs in the std implementation (https://fburl.com/diffusion/uhro7lqb). For large embedding tables (10GB+) we see UniformFillOp taking upwards of 10 minutes as we're bottlenecked on the single-threaded RNG. Swapping to at::mt19937 cuts that time to 10% of the current time. Test Plan: Ran all relevant tests + CI. This doesn't introduce new features (+ is a core change) so existing tests+CI should be sufficient to catch regressions. Reviewed By: dzhulgakov Differential Revision: D23219710 fbshipit-source-id: bd16ed6415b2933e047bcb283a013d47fb395814
diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h
@@ -197,7 +197,7 @@ template <typename T>
 struct normal_distribution {
 
   C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) {
-    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);
+    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be non-negative: ", stdv_in);
     mean = mean_in;
     stdv = stdv_in;
   }
diff --git a/caffe2/core/context.h b/caffe2/core/context.h
@@ -15,6 +15,13 @@
 
 #include <c10/util/ArrayRef.h>
 
+#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
+#include <ATen/core/DistributionsHelper.h>
+#include <ATen/CPUGeneratorImpl.h>
+#else
+#include "caffe2/core/distributions_stubs.h"
+#endif
+
 C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
 
 namespace caffe2 {
@@ -39,7 +46,12 @@ CAFFE2_API uint32_t RandomNumberSeed();
  */
 class CAFFE2_API CPUContext final : public BaseContext {
  public:
+#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
+  typedef at::CPUGeneratorImpl rand_gen_type;
+#else
   typedef std::mt19937 rand_gen_type;
+#endif
+
   CPUContext() {}
   explicit CPUContext(const DeviceOption& option)
       : random_seed_(option.has_random_seed() ? option.random_seed() : 1701),
@@ -66,11 +78,11 @@ class CAFFE2_API CPUContext final : public BaseContext {
 
   inline void FinishDeviceComputation() override {}
 
-  inline rand_gen_type& RandGenerator() {
+  inline rand_gen_type* RandGenerator() {
     if (!random_generator_.get()) {
       random_generator_.reset(new rand_gen_type(RandSeed()));
     }
-    return *random_generator_.get();
+    return random_generator_.get();
   }
 
   inline uint32_t RandSeed() {
diff --git a/caffe2/core/distributions_stubs.h b/caffe2/core/distributions_stubs.h
@@ -0,0 +1,100 @@
+#ifndef CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_
+#define CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_
+
+#include <cstdint>
+#include <random>
+#include <utility>
+
+#include <c10/macros/Macros.h>
+
+/**
+ * This file provides distributions compatible with
+ * ATen/core/DistributionsHelper.h but backed with the std RNG implementation
+ * instead of the ATen one.
+ *
+ * Caffe2 mobile builds currently do not depend on all of ATen so this is
+ * required to allow using the faster ATen RNG for normal builds but keep the
+ * build size small on mobile. RNG performance typically doesn't matter on
+ * mobile builds since the models are small and rarely use random
+ * initialization.
+ */
+
+namespace at {
+// NOTE(review): the unnamed namespace presumably gives these stand-ins
+// internal linkage so they cannot clash with the real ATen definitions of
+// the same names in another translation unit -- confirm before removing it.
+namespace {
+
+/**
+ * Wraps a std <random> distribution `T` so it can be called with a pointer
+ * to the generator instead of a reference. `R` is the type returned by
+ * operator().
+ */
+template <typename R, typename T>
+struct distribution_adapter {
+  // Passes every constructor argument through to the wrapped distribution.
+  template <typename... Args>
+  C10_HOST_DEVICE inline distribution_adapter(Args... args)
+      : distribution_(std::forward<Args>(args)...) {}
+
+  // `generator` must be dereferenceable (call sites pass the pointer returned
+  // by CPUContext::RandGenerator()); the sample is converted to R on return.
+  template <typename RNG>
+  C10_HOST_DEVICE inline R operator()(RNG generator) {
+    return distribution_(*generator);
+  }
+
+ private:
+  T distribution_;
+};
+
+/**
+ * Uniform integers in the closed interval [base, base + range - 1].
+ * The (range, base) argument order follows the call sites written against
+ * the at:: distribution; it is translated to the inclusive (min, max) pair
+ * that std::uniform_int_distribution expects.
+ */
+template <typename T>
+struct uniform_int_from_to_distribution
+    : distribution_adapter<T, std::uniform_int_distribution<T>> {
+  C10_HOST_DEVICE inline uniform_int_from_to_distribution(
+      uint64_t range,
+      int64_t base)
+      : distribution_adapter<T, std::uniform_int_distribution<T>>(
+            static_cast<T>(base),
+            // std is inclusive, at is exclusive; the sum is evaluated in
+            // uint64_t, so narrow it to T explicitly.
+            static_cast<T>(base + range - 1)) {}
+};
+
+// The remaining distributions take the same constructor arguments as their
+// std counterparts, so a plain alias of the adapter is enough.
+template <typename T>
+using uniform_real_distribution =
+    distribution_adapter<T, std::uniform_real_distribution<T>>;
+
+template <typename T>
+using normal_distribution =
+    distribution_adapter<T, std::normal_distribution<T>>;
+
+// std::bernoulli_distribution yields bool; the adapter converts it to T.
+template <typename T>
+using bernoulli_distribution =
+    distribution_adapter<T, std::bernoulli_distribution>;
+
+template <typename T>
+using exponential_distribution =
+    distribution_adapter<T, std::exponential_distribution<T>>;
+
+template <typename T>
+using cauchy_distribution =
+    distribution_adapter<T, std::cauchy_distribution<T>>;
+
+template <typename T>
+using lognormal_distribution =
+    distribution_adapter<T, std::lognormal_distribution<T>>;
+
+} // namespace
+} // namespace at
+
+#endif // CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_
diff --git a/caffe2/operators/dataset_ops.cc b/caffe2/operators/dataset_ops.cc
@@ -987,10 +987,9 @@ class CollectTensorOp final : public Operator<Context> {
       // append
       pos = numVisited_;
     } else {
-      auto& gen = context_.RandGenerator();
       // uniform between [0, numVisited_]
-      std::uniform_int_distribution<int> uniformDist(0, numVisited_);
-      pos = uniformDist(gen);
+      at::uniform_int_from_to_distribution<int> uniformDist(numVisited_+1, 0);
+      pos = uniformDist(context_.RandGenerator());
       if (pos >= numToCollect_) {
         // discard
         pos = -1;
diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
@@ -17,15 +17,15 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
     float scale = 1. / (1. - ratio_);
     // mask=true means keep, and mask=false means not keep, so we will
     // generate probability depending on 1-ratio.
-    std::bernoulli_distribution dist(1. - ratio_);
+    at::bernoulli_distribution<double> dist(1. - ratio_);
     const float* Xdata = X.data<float>();
     float* Ydata = Y->template mutable_data<float>();
 
     auto mask = Output(1, X.sizes(), at::dtype<bool>());
     bool* mask_data = mask->template mutable_data<bool>();
-    auto& gen = context_.RandGenerator();
+    auto* gen = context_.RandGenerator();
     for (int i = 0; i < X.numel(); ++i) {
-      mask_data[i] = dist(gen);
+      mask_data[i] = dist(gen) > 0.5;
       Ydata[i] = Xdata[i] * scale * mask_data[i];
     }
     return true;
diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h
@@ -92,6 +92,7 @@ class FillerOp : public Operator<Context> {
       }
       shape.insert(shape.end(), extra_shape_.begin(), extra_shape_.end());
       output->Resize(shape);
+      shape_ = shape;
     } else {
       output->Resize(shape_);
     }
diff --git a/caffe2/operators/reservoir_sampling.cc b/caffe2/operators/reservoir_sampling.cc
@@ -150,10 +150,9 @@ class ReservoirSamplingOp final : public Operator<Context> {
         // append
         pos = *num_visited;
       } else {
-        auto& gen = context_.RandGenerator();
         // uniform between [0, num_visited]
-        std::uniform_int_distribution<int64_t> uniformDist(0, *num_visited);
-        pos = uniformDist(gen);
+        at::uniform_int_from_to_distribution<int64_t> uniformDist(*num_visited+1, 0);
+        pos = uniformDist(context_.RandGenerator());
         if (pos >= numToCollect_) {
           // discard
           pos = -1;
diff --git a/caffe2/operators/sparse_dropout_with_replacement_op.cc b/caffe2/operators/sparse_dropout_with_replacement_op.cc
@@ -26,12 +26,12 @@ bool SparseDropoutWithReplacementOp<CPUContext>::RunOnDevice() {
       X.numel(),
       "Inconsistent input data. Number of elements should match total length.");
 
-  std::bernoulli_distribution dist(1. - ratio_);
-  auto& gen = context_.RandGenerator();
+  at::bernoulli_distribution<double> dist(1. - ratio_);
+  auto* gen = context_.RandGenerator();
   int32_t total_output_length = 0;
   vector<bool> selected(Lengths.numel(), true);
   for (int i = 0; i < Lengths.numel(); ++i) {
-    if (dist(gen)) {
+    if (dist(gen) > 0.5) {
       output_lengths_data[i] = input_lengths_data[i];
     } else {
       // Replace with a single dropout value.  Even if input length is 0.
diff --git a/caffe2/predictor/predictor_test.cc b/caffe2/predictor/predictor_test.cc
@@ -187,7 +187,7 @@ TEST_F(PredictorTest, SimpleBatchSized) {
   EXPECT_EQ(output.front().sizes().size(), 2);
   EXPECT_EQ(output.front().size(0), 1);
   EXPECT_EQ(output.front().size(1), 10);
-  EXPECT_NEAR(output.front().data<float>()[4], 0.1209, 1E-4);
+  EXPECT_NEAR(output.front().data<float>()[4], 4.9556, 1E-4);
 }
 
 TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
@@ -202,7 +202,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
   EXPECT_EQ(output.front().sizes().size(), 2);
   EXPECT_EQ(output.front().size(0), 1);
   EXPECT_EQ(output.front().size(1), 10);
-  EXPECT_NEAR(output.front().data<float>()[4], 0.1209, 1E-4);
+  EXPECT_NEAR(output.front().data<float>()[4], 4.9556, 1E-4);
 }
 
 } // namespace caffe2
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
@@ -110,7 +110,7 @@ C10_EXPORT void Gemm<float, CPUContext>(
           return;
         default:
           LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
-          return;  // The line above calls `abort()`. Should never reach here.
+          return; // The line above calls `abort()`. Should never reach here.
       }
     }
     case CblasTrans: {
@@ -127,7 +127,7 @@ C10_EXPORT void Gemm<float, CPUContext>(
           return;
         default:
           LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
-          return;  // The line above calls `abort()`. Should never reach here.
+          return; // The line above calls `abort()`. Should never reach here.
       }
     }
     default:
@@ -177,7 +177,7 @@ C10_EXPORT void GemmEx<float, CPUContext>(
           return;
         default:
           LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
-          return;  // The line above calls `abort()`. Should never reach here.
+          return; // The line above calls `abort()`. Should never reach here.
       }
     }
     case CblasTrans: {
@@ -201,7 +201,7 @@ C10_EXPORT void GemmEx<float, CPUContext>(
           return;
         default:
           LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
-          return;  // The line above calls `abort()`. Should never reach here.
+          return; // The line above calls `abort()`. Should never reach here.
       }
     }
     default:
@@ -1065,11 +1065,23 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
 
 #undef DELEGATE_BROADCAST_BINARY_FUNCTION
 
+namespace {
+// Saturating +1: returns a + 1 unless a is already numeric_limits<T>::max(),
+// in which case a is returned unchanged so the increment cannot overflow.
+template <typename T>
+inline T incrementIfNotMax(T a) {
+  if (a == std::numeric_limits<T>::max()) {
+    return a;
+  }
+  return a + 1;
+}
+} // namespace
+
 #define CAFFE2_RAND_UNIFORM_REAL(T)                                      \
   template <>                                                            \
   C10_EXPORT void RandUniform<T, CPUContext>(                            \
       const size_t n, const T a, const T b, T* r, CPUContext* context) { \
-    std::uniform_real_distribution<T> distribution(a, b);                \
+    at::uniform_real_distribution<T> distribution(a, b);                 \
     for (size_t i = 0; i < n; ++i) {                                     \
       r[i] = distribution(context->RandGenerator());                     \
     }                                                                    \
@@ -1078,14 +1090,15 @@ CAFFE2_RAND_UNIFORM_REAL(float);
 CAFFE2_RAND_UNIFORM_REAL(double);
 #undef CAFFE2_RAND_UNIFORM_REAL
 
-#define CAFFE2_RAND_UNIFORM_CHAR(T)                                        \
-  template <>                                                              \
-  C10_EXPORT void RandUniform<T, CPUContext>(                              \
-      const size_t n, const T a, const T b, T* r, CPUContext* context) {   \
-    std::uniform_int_distribution<short> distribution((short)a, (short)b); \
-    for (size_t i = 0; i < n; ++i) {                                       \
-      r[i] = static_cast<T>(distribution(context->RandGenerator()));       \
-    }                                                                      \
+#define CAFFE2_RAND_UNIFORM_CHAR(T)                                      \
+  template <>                                                            \
+  C10_EXPORT void RandUniform<T, CPUContext>(                            \
+      const size_t n, const T a, const T b, T* r, CPUContext* context) { \
+    at::uniform_int_from_to_distribution<short> distribution(            \
+        incrementIfNotMax(b - a), a);                                    \
+    for (size_t i = 0; i < n; ++i) {                                     \
+      r[i] = static_cast<T>(distribution(context->RandGenerator()));     \
+    }                                                                    \
   }
 CAFFE2_RAND_UNIFORM_CHAR(int8_t);
 CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
@@ -1095,7 +1108,10 @@ CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
   template <>                                                            \
   C10_EXPORT void RandUniform<T, CPUContext>(                            \
       const size_t n, const T a, const T b, T* r, CPUContext* context) { \
-    std::uniform_int_distribution<T> distribution(a, b);                 \
+    at::uniform_int_from_to_distribution<T> distribution(                \
+        incrementIfNotMax(                                               \
+            static_cast<uint64_t>(b) - static_cast<uint64_t>(a)),        \
+        a);                                                              \
     for (size_t i = 0; i < n; ++i) {                                     \
       r[i] = distribution(context->RandGenerator());                     \
     }                                                                    \
@@ -1135,7 +1151,7 @@ CAFFE2_RAND_UNIFORM_INT(uint64_t);
       auto remaining_numbers = n - 1 - i;                                 \
       double mean = (sum - current_sum) / (remaining_numbers + 1);        \
       double stdev = std::min(mean - a, b - mean);                        \
-      std::normal_distribution<double> distribution{mean, stdev / 4.0};   \
+      at::normal_distribution<double> distribution{mean, stdev / 4.0};    \
       T value, remaining_sum_test;                                        \
       do {                                                                \
         value = distribution(context->RandGenerator());                   \
@@ -1350,7 +1366,8 @@ CAFFE2_RAND_SYNTHETIC_DATA(uint64_t);
       CAFFE_ENFORCE_EQ(                                              \
           m, avoid_set.size(), "AC10_EXPORT void should be unique"); \
     }                                                                \
-    std::uniform_int_distribution<T> distribution(a, b);             \
+    at::uniform_int_from_to_distribution<T> distribution(            \
+        incrementIfNotMax(b - a), a);                                \
     T v = 0;                                                         \
     for (size_t i = 0; i < n; ++i) {                                 \
       do {                                                           \
@@ -1372,7 +1389,7 @@ C10_EXPORT void RandGaussian<float, CPUContext>(
     const float std,
     float* r,
     CPUContext* context) {
-  std::normal_distribution<float> distribution(mean, std);
+  at::normal_distribution<float> distribution(mean, std);
   for (size_t i = 0; i < n; ++i) {
     r[i] = distribution(context->RandGenerator());
   }

Original file line number	Diff line number	Diff line change
`@@ -197,7 +197,7 @@ template <typename T>`
`197`	`197`	`struct normal_distribution {`
`198`	`198`
`199`	`199`	`C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) {`
`200`		`- TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);`
	`200`	`+ TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be non-negative: ", stdv_in);`
`201`	`201`	`mean = mean_in;`
`202`	`202`	`stdv = stdv_in;`
`203`	`203`	`}`
Original file line number	Diff line number	Diff line change
`@@ -92,6 +92,7 @@ class FillerOp : public Operator<Context> {`
`92`	`92`	`}`
`93`	`93`	`shape.insert(shape.end(), extra_shape_.begin(), extra_shape_.end());`
`94`	`94`	`output->Resize(shape);`
	`95`	`+ shape_ = shape;`
`95`	`96`	`} else {`
`96`	`97`	`output->Resize(shape_);`
`97`	`98`	`}`