Skip to content

Commit 0c9787c

Browse files
d4l3kfacebook-github-bot
authored and committed
caffe2: use at::mt19937 instead of std::mt19937 (10x speedup) (pytorch#43987)
Summary: Pull Request resolved: pytorch#43987 This replaces the caffe2 CPU random number generator (std::mt19937) with at::mt19937, which is the one currently used in PyTorch. The ATen RNG is 10x faster than the std one and appears to be more robust, given bugs in the std implementation (https://fburl.com/diffusion/uhro7lqb). For large embedding tables (10GB+) we see UniformFillOp taking upwards of 10 minutes because we're bottlenecked on the single-threaded RNG. Swapping to at::mt19937 cuts that time to 10% of the current duration. Test Plan: Ran all relevant tests + CI. This doesn't introduce new features (and is a core change), so existing tests + CI should be sufficient to catch regressions. Reviewed By: dzhulgakov Differential Revision: D23219710 fbshipit-source-id: bd16ed6415b2933e047bcb283a013d47fb395814
1 parent e6e83bf commit 0c9787c

10 files changed

+137
-34
lines changed

aten/src/ATen/core/DistributionsHelper.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ template <typename T>
197197
struct normal_distribution {
198198

199199
C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) {
200-
TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);
200+
TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be positive: ", stdv_in);
201201
mean = mean_in;
202202
stdv = stdv_in;
203203
}

caffe2/core/context.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515

1616
#include <c10/util/ArrayRef.h>
1717

18+
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
19+
#include <ATen/core/DistributionsHelper.h>
20+
#include <ATen/CPUGeneratorImpl.h>
21+
#else
22+
#include "caffe2/core/distributions_stubs.h"
23+
#endif
24+
1825
C10_DECLARE_bool(caffe2_report_cpu_memory_usage);
1926

2027
namespace caffe2 {
@@ -39,7 +46,12 @@ CAFFE2_API uint32_t RandomNumberSeed();
3946
*/
4047
class CAFFE2_API CPUContext final : public BaseContext {
4148
public:
49+
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
50+
typedef at::CPUGeneratorImpl rand_gen_type;
51+
#else
4252
typedef std::mt19937 rand_gen_type;
53+
#endif
54+
4355
CPUContext() {}
4456
explicit CPUContext(const DeviceOption& option)
4557
: random_seed_(option.has_random_seed() ? option.random_seed() : 1701),
@@ -66,11 +78,11 @@ class CAFFE2_API CPUContext final : public BaseContext {
6678

6779
inline void FinishDeviceComputation() override {}
6880

69-
inline rand_gen_type& RandGenerator() {
81+
inline rand_gen_type* RandGenerator() {
7082
if (!random_generator_.get()) {
7183
random_generator_.reset(new rand_gen_type(RandSeed()));
7284
}
73-
return *random_generator_.get();
85+
return random_generator_.get();
7486
}
7587

7688
inline uint32_t RandSeed() {

caffe2/core/distributions_stubs.h

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#ifndef CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_
#define CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_

// Standard headers for everything this file names directly: the fixed-width
// integer types, the std distributions and std::forward. Including them here
// keeps the header self-contained instead of relying on transitive includes.
#include <cstdint>
#include <random>
#include <utility>

#include <c10/macros/Macros.h>

/**
 * This file provides distributions compatible with
 * ATen/core/DistributionsHelper.h but backed with the std RNG implementation
 * instead of the ATen one.
 *
 * Caffe2 mobile builds currently do not depend on all of ATen so this is
 * required to allow using the faster ATen RNG for normal builds but keep the
 * build size small on mobile. RNG performance typically doesn't matter on
 * mobile builds since the models are small and rarely use random
 * initialization.
 */

namespace at {
// NOTE(review): an unnamed namespace inside a header gives every translation
// unit that includes it its own copy of these types. Left unchanged here to
// keep linkage exactly as it was; confirm that this is intentional.
namespace {

/**
 * Wraps a std-style distribution `T` so that it is invoked with a
 * dereferenceable generator handle (e.g. a pointer) rather than a generator
 * reference, and returns each sample converted to `R`.
 */
template <typename R, typename T>
struct distribution_adapter {
  /// Passes every constructor argument on to the wrapped distribution.
  template <typename... Args>
  C10_HOST_DEVICE inline distribution_adapter(Args... args)
      : distribution_(std::forward<Args>(args)...) {}

  /// Draws one sample; `generator` is dereferenced to obtain the engine that
  /// is handed to the wrapped distribution.
  template <typename RNG>
  C10_HOST_DEVICE inline R operator()(RNG generator) {
    return distribution_(*generator);
  }

 private:
  T distribution_;
};

/**
 * Uniform integer distribution described by a (range, base) pair, i.e. values
 * in the half-open interval [base, base + range), implemented on top of
 * std::uniform_int_distribution<T>.
 */
template <typename T>
struct uniform_int_from_to_distribution
    : distribution_adapter<T, std::uniform_int_distribution<T>> {
  C10_HOST_DEVICE inline uniform_int_from_to_distribution(
      uint64_t range,
      int64_t base)
      : distribution_adapter<T, std::uniform_int_distribution<T>>(
            static_cast<T>(base),
            // std's upper bound is inclusive while the (range, base) form is
            // exclusive, hence the "- 1". The sum is evaluated in uint64_t
            // (as the implicit promotion did before) and then converted to T.
            static_cast<T>(static_cast<uint64_t>(base) + range - 1)) {}
};

/// std::uniform_real_distribution<T> behind the adapter interface.
template <typename T>
using uniform_real_distribution =
    distribution_adapter<T, std::uniform_real_distribution<T>>;

/// std::normal_distribution<T> behind the adapter interface.
template <typename T>
using normal_distribution =
    distribution_adapter<T, std::normal_distribution<T>>;

/// std::bernoulli_distribution behind the adapter interface; the bool sample
/// is returned converted to `T`.
template <typename T>
using bernoulli_distribution =
    distribution_adapter<T, std::bernoulli_distribution>;

/// std::exponential_distribution<T> behind the adapter interface.
template <typename T>
using exponential_distribution =
    distribution_adapter<T, std::exponential_distribution<T>>;

/// std::cauchy_distribution<T> behind the adapter interface.
template <typename T>
using cauchy_distribution =
    distribution_adapter<T, std::cauchy_distribution<T>>;

/// std::lognormal_distribution<T> behind the adapter interface.
template <typename T>
using lognormal_distribution =
    distribution_adapter<T, std::lognormal_distribution<T>>;

} // namespace
} // namespace at

#endif // CAFFE2_CORE_DISTRIBUTIONS_STUBS_H_

caffe2/operators/dataset_ops.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -987,10 +987,9 @@ class CollectTensorOp final : public Operator<Context> {
987987
// append
988988
pos = numVisited_;
989989
} else {
990-
auto& gen = context_.RandGenerator();
991990
// uniform between [0, numVisited_]
992-
std::uniform_int_distribution<int> uniformDist(0, numVisited_);
993-
pos = uniformDist(gen);
991+
at::uniform_int_from_to_distribution<int> uniformDist(numVisited_+1, 0);
992+
pos = uniformDist(context_.RandGenerator());
994993
if (pos >= numToCollect_) {
995994
// discard
996995
pos = -1;

caffe2/operators/dropout_op.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
1717
float scale = 1. / (1. - ratio_);
1818
// mask=true means keep, and mask=false means not keep, so we will
1919
// generate probability depending on 1-ratio.
20-
std::bernoulli_distribution dist(1. - ratio_);
20+
at::bernoulli_distribution<double> dist(1. - ratio_);
2121
const float* Xdata = X.data<float>();
2222
float* Ydata = Y->template mutable_data<float>();
2323

2424
auto mask = Output(1, X.sizes(), at::dtype<bool>());
2525
bool* mask_data = mask->template mutable_data<bool>();
26-
auto& gen = context_.RandGenerator();
26+
auto* gen = context_.RandGenerator();
2727
for (int i = 0; i < X.numel(); ++i) {
28-
mask_data[i] = dist(gen);
28+
mask_data[i] = dist(gen) > 0.5;
2929
Ydata[i] = Xdata[i] * scale * mask_data[i];
3030
}
3131
return true;

caffe2/operators/filler_op.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class FillerOp : public Operator<Context> {
9292
}
9393
shape.insert(shape.end(), extra_shape_.begin(), extra_shape_.end());
9494
output->Resize(shape);
95+
shape_ = shape;
9596
} else {
9697
output->Resize(shape_);
9798
}

caffe2/operators/reservoir_sampling.cc

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,9 @@ class ReservoirSamplingOp final : public Operator<Context> {
150150
// append
151151
pos = *num_visited;
152152
} else {
153-
auto& gen = context_.RandGenerator();
154153
// uniform between [0, num_visited]
155-
std::uniform_int_distribution<int64_t> uniformDist(0, *num_visited);
156-
pos = uniformDist(gen);
154+
at::uniform_int_from_to_distribution<int64_t> uniformDist(*num_visited+1, 0);
155+
pos = uniformDist(context_.RandGenerator());
157156
if (pos >= numToCollect_) {
158157
// discard
159158
pos = -1;

caffe2/operators/sparse_dropout_with_replacement_op.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ bool SparseDropoutWithReplacementOp<CPUContext>::RunOnDevice() {
2626
X.numel(),
2727
"Inconsistent input data. Number of elements should match total length.");
2828

29-
std::bernoulli_distribution dist(1. - ratio_);
30-
auto& gen = context_.RandGenerator();
29+
at::bernoulli_distribution<double> dist(1. - ratio_);
30+
auto* gen = context_.RandGenerator();
3131
int32_t total_output_length = 0;
3232
vector<bool> selected(Lengths.numel(), true);
3333
for (int i = 0; i < Lengths.numel(); ++i) {
34-
if (dist(gen)) {
34+
if (dist(gen) > 0.5) {
3535
output_lengths_data[i] = input_lengths_data[i];
3636
} else {
3737
// Replace with a single dropout value. Even if input length is 0.

caffe2/predictor/predictor_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ TEST_F(PredictorTest, SimpleBatchSized) {
187187
EXPECT_EQ(output.front().sizes().size(), 2);
188188
EXPECT_EQ(output.front().size(0), 1);
189189
EXPECT_EQ(output.front().size(1), 10);
190-
EXPECT_NEAR(output.front().data<float>()[4], 0.1209, 1E-4);
190+
EXPECT_NEAR(output.front().data<float>()[4], 4.9556, 1E-4);
191191
}
192192

193193
TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
@@ -202,7 +202,7 @@ TEST_F(PredictorTest, SimpleBatchSizedMapInput) {
202202
EXPECT_EQ(output.front().sizes().size(), 2);
203203
EXPECT_EQ(output.front().size(0), 1);
204204
EXPECT_EQ(output.front().size(1), 10);
205-
EXPECT_NEAR(output.front().data<float>()[4], 0.1209, 1E-4);
205+
EXPECT_NEAR(output.front().data<float>()[4], 4.9556, 1E-4);
206206
}
207207

208208
} // namespace caffe2

caffe2/utils/math_cpu.cc

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ C10_EXPORT void Gemm<float, CPUContext>(
110110
return;
111111
default:
112112
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
113-
return; // The line above calls `abort()`. Should never reach here.
113+
return; // The line above calls `abort()`. Should never reach here.
114114
}
115115
}
116116
case CblasTrans: {
@@ -127,7 +127,7 @@ C10_EXPORT void Gemm<float, CPUContext>(
127127
return;
128128
default:
129129
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
130-
return; // The line above calls `abort()`. Should never reach here.
130+
return; // The line above calls `abort()`. Should never reach here.
131131
}
132132
}
133133
default:
@@ -177,7 +177,7 @@ C10_EXPORT void GemmEx<float, CPUContext>(
177177
return;
178178
default:
179179
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
180-
return; // The line above calls `abort()`. Should never reach here.
180+
return; // The line above calls `abort()`. Should never reach here.
181181
}
182182
}
183183
case CblasTrans: {
@@ -201,7 +201,7 @@ C10_EXPORT void GemmEx<float, CPUContext>(
201201
return;
202202
default:
203203
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
204-
return; // The line above calls `abort()`. Should never reach here.
204+
return; // The line above calls `abort()`. Should never reach here.
205205
}
206206
}
207207
default:
@@ -1065,11 +1065,23 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
10651065

10661066
#undef DELEGATE_BROADCAST_BINARY_FUNCTION
10671067

1068+
namespace {
// Saturating successor: yields a + 1, except that the largest value
// representable in T is handed back unchanged, so the addition can never
// overflow the type.
template <typename T>
inline T incrementIfNotMax(T a) {
  const bool at_ceiling = (a == std::numeric_limits<T>::max());
  return at_ceiling ? a : a + 1;
}
} // namespace
1079+
10681080
#define CAFFE2_RAND_UNIFORM_REAL(T) \
10691081
template <> \
10701082
C10_EXPORT void RandUniform<T, CPUContext>( \
10711083
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1072-
std::uniform_real_distribution<T> distribution(a, b); \
1084+
at::uniform_real_distribution<T> distribution(a, b); \
10731085
for (size_t i = 0; i < n; ++i) { \
10741086
r[i] = distribution(context->RandGenerator()); \
10751087
} \
@@ -1078,14 +1090,15 @@ CAFFE2_RAND_UNIFORM_REAL(float);
10781090
CAFFE2_RAND_UNIFORM_REAL(double);
10791091
#undef CAFFE2_RAND_UNIFORM_REAL
10801092

1081-
#define CAFFE2_RAND_UNIFORM_CHAR(T) \
1082-
template <> \
1083-
C10_EXPORT void RandUniform<T, CPUContext>( \
1084-
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1085-
std::uniform_int_distribution<short> distribution((short)a, (short)b); \
1086-
for (size_t i = 0; i < n; ++i) { \
1087-
r[i] = static_cast<T>(distribution(context->RandGenerator())); \
1088-
} \
1093+
#define CAFFE2_RAND_UNIFORM_CHAR(T) \
1094+
template <> \
1095+
C10_EXPORT void RandUniform<T, CPUContext>( \
1096+
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1097+
at::uniform_int_from_to_distribution<short> distribution( \
1098+
incrementIfNotMax(b - a), a); \
1099+
for (size_t i = 0; i < n; ++i) { \
1100+
r[i] = static_cast<T>(distribution(context->RandGenerator())); \
1101+
} \
10891102
}
10901103
CAFFE2_RAND_UNIFORM_CHAR(int8_t);
10911104
CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
@@ -1095,7 +1108,10 @@ CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
10951108
template <> \
10961109
C10_EXPORT void RandUniform<T, CPUContext>( \
10971110
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
1098-
std::uniform_int_distribution<T> distribution(a, b); \
1111+
at::uniform_int_from_to_distribution<T> distribution( \
1112+
incrementIfNotMax( \
1113+
static_cast<uint64_t>(b) - static_cast<uint64_t>(a)), \
1114+
a); \
10991115
for (size_t i = 0; i < n; ++i) { \
11001116
r[i] = distribution(context->RandGenerator()); \
11011117
} \
@@ -1135,7 +1151,7 @@ CAFFE2_RAND_UNIFORM_INT(uint64_t);
11351151
auto remaining_numbers = n - 1 - i; \
11361152
double mean = (sum - current_sum) / (remaining_numbers + 1); \
11371153
double stdev = std::min(mean - a, b - mean); \
1138-
std::normal_distribution<double> distribution{mean, stdev / 4.0}; \
1154+
at::normal_distribution<double> distribution{mean, stdev / 4.0}; \
11391155
T value, remaining_sum_test; \
11401156
do { \
11411157
value = distribution(context->RandGenerator()); \
@@ -1350,7 +1366,8 @@ CAFFE2_RAND_SYNTHETIC_DATA(uint64_t);
13501366
CAFFE_ENFORCE_EQ( \
13511367
m, avoid_set.size(), "AC10_EXPORT void should be unique"); \
13521368
} \
1353-
std::uniform_int_distribution<T> distribution(a, b); \
1369+
at::uniform_int_from_to_distribution<T> distribution( \
1370+
incrementIfNotMax(b - a), a); \
13541371
T v = 0; \
13551372
for (size_t i = 0; i < n; ++i) { \
13561373
do { \
@@ -1372,7 +1389,7 @@ C10_EXPORT void RandGaussian<float, CPUContext>(
13721389
const float std,
13731390
float* r,
13741391
CPUContext* context) {
1375-
std::normal_distribution<float> distribution(mean, std);
1392+
at::normal_distribution<float> distribution(mean, std);
13761393
for (size_t i = 0; i < n; ++i) {
13771394
r[i] = distribution(context->RandGenerator());
13781395
}

0 commit comments

Comments
 (0)