
Commit

Add fill_uniform_random_bits and use it to speed up tests/benchmarks.
PiperOrigin-RevId: 686310094
dsharletg authored and xnnpack-bot committed Oct 16, 2024
1 parent 6a9e953 commit 0e576bd
Showing 23 changed files with 92 additions and 206 deletions.
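The new helper appears throughout the diff below as `xnnpack::fill_uniform_random_bits(ptr, count, rng)`, and the BUILD.bazel hunks drop the `xnnpack_slow_benchmark_tags()` tag from benchmarks that are presumably no longer slow enough to warrant it. As a minimal sketch of the idea (an illustrative assumption, not the actual XNNPACK implementation; the real helper's signature and location in the tree may differ), filling a buffer with raw generator words avoids constructing a `std::uniform_int_distribution` and invoking it once per element:

// Hypothetical sketch of fill_uniform_random_bits -- NOT the actual
// XNNPACK source. For element types where every bit pattern is a valid
// value (uint8_t, int8_t, packed int4), drawing whole 32-bit words from
// the generator is equivalent to a per-element uniform distribution,
// but far cheaper.
#include <cstddef>
#include <cstdint>
#include <cstring>

namespace xnnpack {

template <typename T, typename Rng>
void fill_uniform_random_bits(T* data, size_t count, Rng& rng) {
  uint8_t* bytes = reinterpret_cast<uint8_t*>(data);
  const size_t num_bytes = count * sizeof(T);
  size_t i = 0;
  // Bulk-fill four bytes per generator call.
  for (; i + sizeof(uint32_t) <= num_bytes; i += sizeof(uint32_t)) {
    const uint32_t bits = rng();
    std::memcpy(bytes + i, &bits, sizeof(bits));
  }
  // Copy the low bytes of one more word into the 1-3 byte tail, if any.
  if (i < num_bytes) {
    const uint32_t bits = rng();
    std::memcpy(bytes + i, &bits, num_bytes - i);
  }
}

}  // namespace xnnpack

// Usage, mirroring the call sites in this commit:
//   std::mt19937 rng(std::random_device{}());
//   xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);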
15 changes: 2 additions & 13 deletions bench/BUILD.bazel
@@ -265,7 +265,6 @@ xnnpack_benchmark(
srcs = [
"%s.cc" % kernel.replace("_", "-"),
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS,
) for kernel in [
"xx_transposev",
@@ -293,7 +292,6 @@ xnnpack_benchmark(
srcs = [
"qs8-dwconv.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
":dwconv",
"//:indirection",
@@ -487,7 +485,6 @@ xnnpack_benchmark(
"bgemm.h",
"x8-packq.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
":packq_benchmark",
"//:allocator",
@@ -501,7 +498,6 @@ xnnpack_benchmark(
"packw-benchmark.h",
"x8-packw.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
"//:allocator",
],
@@ -514,7 +510,6 @@ xnnpack_benchmark(
"packw-benchmark.h",
"qs8-packw.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
"//:allocator",
],
@@ -527,7 +522,6 @@ xnnpack_benchmark(
"packw-benchmark.h",
"x16-packw.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
"//:allocator",
],
@@ -540,7 +534,6 @@ xnnpack_benchmark(
"packw-benchmark.h",
"x32-packw.cc",
],
- tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
"//:allocator",
],
@@ -602,9 +595,7 @@ xnnpack_benchmark(
name = "convolution_bench",
srcs = ["convolution.cc"],
copts = xnnpack_optional_tflite_copts(),
- tags = xnnpack_slow_benchmark_tags() + [
- "nowin32",
- ],
+ tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)

@@ -649,8 +640,6 @@ xnnpack_benchmark(
name = "scaled_dot_product_attention_bench",
srcs = ["scaled-dot-product-attention.cc"],
copts = xnnpack_optional_tflite_copts(),
- tags = xnnpack_slow_benchmark_tags() + [
- "nowin32",
- ],
+ tags = xnnpack_slow_benchmark_tags() + ["nowin32"],
deps = OPERATOR_BENCHMARK_DEPS + xnnpack_optional_tflite_deps(),
)
3 changes: 1 addition & 2 deletions bench/average-pooling.cc
@@ -40,13 +40,12 @@ static void xnnpack_average_pooling_qu8(benchmark::State& state, const char* net

std::random_device random_device;
auto rng = std::mt19937(random_device());
- auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

const size_t output_height = (2 * padding_size + input_height - pooling_size) / stride + 1;
const size_t output_width = (2 * padding_size + input_width - pooling_size) / stride + 1;

xnnpack::Buffer<uint8_t> input(batch_size * input_height * input_width * channels + XNN_EXTRA_BYTES / sizeof(uint8_t));
- std::generate(input.begin(), input.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
xnnpack::Buffer<uint8_t> output(batch_size * output_height * output_width * channels);

xnn_status status = xnn_initialize(nullptr /* allocator */);
3 changes: 1 addition & 2 deletions bench/channel-shuffle.cc
@@ -27,11 +27,10 @@ static void channel_shuffle_x8(benchmark::State& state, const char* net) {

std::random_device random_device;
auto rng = std::mt19937(random_device());
- auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

xnnpack::Buffer<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels);
xnnpack::Buffer<uint8_t> output(batch_size * groups * group_channels);
- std::generate(input.begin(), input.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);

xnn_status status = xnn_initialize(nullptr /* allocator */);
if (status != xnn_status_success) {
11 changes: 4 additions & 7 deletions bench/convolution.cc
@@ -48,7 +48,6 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
- auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

const size_t output_pixel_stride = groups * group_output_channels;
const size_t input_pixel_stride = groups * group_input_channels;
@@ -62,9 +61,9 @@ void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

xnnpack::Buffer<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint8_t));
- std::generate(input.begin(), input.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
xnnpack::Buffer<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
- std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(kernel.data(), kernel.size(), rng);
xnnpack::Buffer<int32_t> bias(groups * group_output_channels);
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
@@ -187,8 +186,6 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
- auto i8rng = std::bind(
- std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));

const size_t output_pixel_stride = groups * group_output_channels;
const size_t input_pixel_stride = groups * group_input_channels;
@@ -202,9 +199,9 @@ void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

xnnpack::Buffer<int8_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(int8_t));
- std::generate(input.begin(), input.end(), std::ref(i8rng));
+ xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
xnnpack::Buffer<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
- std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
+ xnnpack::fill_uniform_random_bits(kernel.data(), kernel.size(), rng);
xnnpack::Buffer<int32_t> bias(groups * group_output_channels);
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
7 changes: 2 additions & 5 deletions bench/deconvolution.cc
@@ -46,9 +46,6 @@ void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
- auto u8rng = std::bind(
- std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
- std::ref(rng));

const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
@@ -60,9 +57,9 @@ void xnnpack_deconvolution_qu8(benchmark::State& state, const char* net) {
const size_t output_width = std::max(stride_width * (input_width - 1) + adjustment + effective_kernel_width, padding_width) - padding_width;

xnnpack::Buffer<uint8_t> input(XNN_EXTRA_BYTES + batch_size * input_height * input_width * input_channels);
- std::generate(input.begin(), input.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
xnnpack::Buffer<uint8_t> kernel(output_channels * kernel_height * kernel_width * input_channels);
- std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(kernel.data(), kernel.size(), rng);
xnnpack::Buffer<int32_t> bias(output_channels);
std::generate(bias.begin(), bias.end(), std::ref(i32rng));
const size_t output_elements = batch_size * output_height * output_width * output_channels;
6 changes: 1 addition & 5 deletions bench/f32-qc4w-gemm.cc
@@ -46,15 +46,11 @@ static void GEMMBenchmark(benchmark::State& state,
std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
- auto u8rng = std::bind(
- std::uniform_int_distribution<int32_t>(
- std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
- std::ref(rng));

xnnpack::Buffer<float> a(mc * kc + XNN_EXTRA_BYTES / sizeof(float));
std::generate(a.begin(), a.end(), std::ref(f32rng));
xnnpack::Buffer<uint8_t> k(nc * kc * sizeof(uint8_t) / 2 /* int4_t */);
- std::generate(k.begin(), k.end(), std::ref(u8rng));
+ xnnpack::fill_uniform_random_bits(k.data(), k.size(), rng);
xnnpack::Buffer<float> b(nc);
std::generate(b.begin(), b.end(), std::ref(f32rng));

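Note the split in f32-qc4w-gemm above: the float buffers `a` and `b` keep the `f32rng` generator, because arbitrary random bits reinterpreted as `float` could be NaN or infinity, while the int4-packed weight buffer `k` switches to `fill_uniform_random_bits`, since every nibble is a valid quantized weight. The `i32rng` bias generators in the convolution and deconvolution benchmarks survive for the same reason: their values are deliberately constrained to [-10000, 10000].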
(diff truncated: the remaining 17 changed files are not shown)
