Skip to content

Commit 4ec95c2

Browse files
authored
[New lab] Added a new lab about SW memory prefetching (#17)
Authored-by: @ibogosavljevic
1 parent 2ad129d commit 4ec95c2

File tree

8 files changed

+163
-0
lines changed

8 files changed

+163
-0
lines changed

buildbot/runCI.py

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class LabPath:
5757
Labs["memory_bound"]["data_packing"] = LabParams(threshold=13.0)
5858
Labs["memory_bound"]["loop_interchange_1"] = LabParams(threshold=85.0)
5959
Labs["memory_bound"]["loop_interchange_2"] = LabParams(threshold=75.0)
60+
Labs["memory_bound"]["swmem_prefetch_1"] = LabParams(threshold=30.0)
6061
Labs["misc"]["warmup"] = LabParams(threshold=50.0)
6162
Labs["core_bound"]["function_inlining_1"] = LabParams(threshold=35.0)
6263
Labs["core_bound"]["compiler_intrinsics_1"] = LabParams(threshold=60.0)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Build script for this lab: declares the project and delegates everything
# else to the shared tools/labs.cmake found relative to the repository root.
cmake_minimum_required(VERSION 2.8.12)
2+
3+
project(lab)
4+
5+
# Optional program arguments, for example:
6+
# set(VALIDATE_ARGS "${CMAKE_CURRENT_SOURCE_DIR}/input.file" "${CMAKE_CURRENT_SOURCE_DIR}/output.file")
7+
# set(LAB_ARGS "${CMAKE_CURRENT_SOURCE_DIR}/input.file" "output.file")
8+
9+
# Capture the part of this directory's path that precedes the "labs" path
# component (accepting either '/' or '\' as separator). The first capture
# group is what the include() below reads through CMAKE_MATCH_1.
string(REGEX MATCH "^(.*)[\\/]labs[\\/].*$" repo "${CMAKE_CURRENT_SOURCE_DIR}")
10+
# Pull in the shared lab rules from <captured prefix>/tools/labs.cmake.
include(${CMAKE_MATCH_1}/tools/labs.cmake)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Software memory prefetching
2+
3+
When the CPU data prefetcher cannot figure out the memory access pattern, software prefetching comes in handy. The idea is to use special instructions that tell the CPU: "Hey, I plan to use this memory location a bit later, could you fetch it for me while I do other stuff so it waits for me when I am back".
4+
5+
In GCC and Clang, you can use `__builtin_prefetch` to ask the CPU to prefetch data. Say, for example, that you are going to access an element of array `my_array[index]`, where `index` is some random number. To prefetch it, you will use `__builtin_prefetch(&my_array[index]);` or, equivalently for a plain array or pointer, `__builtin_prefetch(my_array + index);`.
6+
7+
Prefetching can improve performance, but it can also hurt it. It helps when the piece of data you are about to access is not yet in the data cache; it hurts when the data is already cached, because the prefetch instruction is then just extra work. So most of the time, it pays off when there are random memory accesses on a large data structure, such as a tree or a hash map.
8+
9+
An additional prerequisite for a speedup is that some time must pass between the moment you request the prefetch and the moment you actually access the data (this interval is known as the "prefetching window"). Accessing the data immediately after requesting the prefetch will not give the expected results, because the access still has to wait for the data to arrive.
10+
11+
Authored-by: @ibogosavljevic
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
#include "benchmark/benchmark.h"
3+
#include "solution.hpp"
4+
#include <memory>
5+
6+
// Benchmark body: builds the hash map and the lookup list once, then times
// repeated calls to solution() over that fixed data set.
static void bench1(benchmark::State &state) {
  // Init benchmark data
  auto hash_map = std::make_unique<hash_map_t>(HASH_MAP_SIZE);
  std::vector<int> lookups;
  // Reserve capacity only. init() appends its keys with push_back, so calling
  // resize() here would leave NUMBER_OF_LOOKUPS zero-valued entries in front
  // of the real keys and double the amount of work being measured.
  lookups.reserve(NUMBER_OF_LOOKUPS);
  init(hash_map.get(), lookups);

  // Run the benchmark
  for (auto _ : state) {
    auto output = solution(hash_map.get(), lookups);
    // Keep the result alive so the call is not optimised away.
    benchmark::DoNotOptimize(output);
  }
}
19+
20+
// Register the function as a benchmark and measure time in milliseconds
21+
BENCHMARK(bench1)->Unit(benchmark::kMillisecond);
22+
23+
// Run the benchmark
24+
BENCHMARK_MAIN();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#include "solution.hpp"
2+
#include <limits>
3+
#include <random>
4+
5+
// Prepares the lab's input data.
//
// hash_map: receives HASH_MAP_SIZE insert() calls with pseudo-random
//           non-negative keys.
// lookups:  on return holds exactly NUMBER_OF_LOOKUPS pseudo-random
//           non-negative keys; any previous contents are discarded.
//
// The random engine is default-constructed, so it always starts from the
// same default seed and every run sees the same data for a given standard
// library implementation.
void init(hash_map_t *hash_map, std::vector<int> &lookups) {
  std::default_random_engine generator;
  std::uniform_int_distribution<int> distribution(
      0, std::numeric_limits<int>::max());
  for (size_t i = 0; i < HASH_MAP_SIZE; i++) {
    hash_map->insert(distribution(generator));
  }

  // Start from an empty vector: callers may hand over a vector that was
  // already sized to NUMBER_OF_LOOKUPS, and appending to it would produce
  // twice as many lookups, half of them zero.
  lookups.clear();
  lookups.reserve(NUMBER_OF_LOOKUPS);
  for (size_t i = 0; i < NUMBER_OF_LOOKUPS; i++) {
    lookups.push_back(distribution(generator));
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#include "solution.hpp"
2+
3+
// Adds up the decimal digits of n by repeatedly taking the remainder of a
// division by 10. Returns 0 for n == 0; for a negative n every remainder is
// non-positive, so the result is the negated digit sum.
static int getSumOfDigits(int n) {
  int total = 0;
  for (int rest = n; rest != 0; rest /= 10) {
    total += rest % 10;
  }
  return total;
}
11+
12+
int solution(const hash_map_t *hash_map, const std::vector<int> &lookups) {
13+
int result = 0;
14+
15+
for (int val : lookups) {
16+
if (hash_map->find(val))
17+
result += getSumOfDigits(val);
18+
}
19+
20+
return result;
21+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include <vector>
2+
#include <limits>
3+
4+
static constexpr size_t HASH_MAP_SIZE = 32 * 1024 * 1024 - 5;
5+
static constexpr size_t NUMBER_OF_LOOKUPS = 1024 * 1024;
6+
7+
// Fixed-size set of ints with exactly one slot per bucket and no collision
// resolution: a key lives in bucket (key % bucket count), and an insert into
// an already occupied bucket is rejected.
//
// The value std::numeric_limits<int>::max() is reserved as the "empty slot"
// marker and therefore can never be stored or found.
// The bucket count passed to the constructor must be non-zero, because every
// operation takes a remainder by it.
class hash_map_t {
  // Marker stored in every empty bucket.
  static constexpr int UNUSED = std::numeric_limits<int>::max();
  std::vector<int> m_vector;
  size_t N_Buckets;

  // Bucket index for val. The remainder is computed in size_t, so the result
  // is always a valid index into m_vector, including for negative val.
  size_t bucketOf(int val) const { return val % N_Buckets; }

public:
  // Creates a map with `size` buckets, all initially empty.
  hash_map_t(size_t size) : m_vector(size, UNUSED), N_Buckets(size) {}

  // Stores val if its bucket is empty. Returns true when val was stored and
  // false when the bucket is already occupied or val is the reserved marker.
  bool insert(int val) {
    if (val == UNUSED) {
      return false;
    }
    const size_t bucket = bucketOf(val);
    if (m_vector[bucket] == UNUSED) {
      m_vector[bucket] = val;
      return true;
    }
    return false;
  }

  // Returns true only if val itself is stored. Comparing against val (rather
  // than merely checking that the bucket is occupied) keeps a different key
  // that shares the bucket from being reported as a hit.
  bool find(int val) const {
    return val != UNUSED && m_vector[bucketOf(val)] == val;
  }

  // Issues a prefetch hint for the slot that a later find(val) will read.
  void prefetchForVal(int val) const {
    __builtin_prefetch(&m_vector[bucketOf(val)]);
  }
};
33+
34+
void init(hash_map_t* hash_map, std::vector<int>& lookups);
35+
int solution(const hash_map_t* hash_map, const std::vector<int>& lookups);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
2+
#include "solution.hpp"
3+
#include <iostream>
4+
#include <memory>
5+
6+
// Reference digit-sum helper for the validator: peels off the last decimal
// digit of n until nothing is left and returns the sum of those digits.
// Yields 0 for n == 0; a negative n produces the negated digit sum.
static int getSumOfDigits(int n) {
  int acc = 0;
  while (n) {
    const int digit = n % 10;
    acc += digit;
    n /= 10;
  }
  return acc;
}
14+
15+
static int original_solution(const hash_map_t *hash_map,
16+
const std::vector<int> &lookups) {
17+
int result = 0;
18+
19+
for (int val : lookups) {
20+
if (hash_map->find(val))
21+
result += getSumOfDigits(val);
22+
}
23+
24+
return result;
25+
}
26+
27+
int main() {
28+
// Init benchmark data
29+
auto hash_map = std::make_unique<hash_map_t>(HASH_MAP_SIZE);
30+
std::vector<int> lookups;
31+
lookups.resize(NUMBER_OF_LOOKUPS);
32+
init(hash_map.get(), lookups);
33+
34+
auto original_result = original_solution(hash_map.get(), lookups);
35+
auto result = solution(hash_map.get(), lookups);
36+
37+
if (original_result != result) {
38+
std::cerr << "Validation Failed. Original result = " << original_result
39+
<< "; Modified version returned = " << result << "\n";
40+
return 1;
41+
}
42+
43+
std::cout << "Validation Successful" << std::endl;
44+
return 0;
45+
}

0 commit comments

Comments
 (0)