-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfalse_sharing.cpp
103 lines (89 loc) · 3.32 KB
/
false_sharing.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
// Conclusion:
// Data aligned to the cache line boundary avoids the false sharing situation.
// Also, a higher-generation CPU doesn't necessarily give more performance.
// For example, I tested this code on an Intel Core i3 8350U, where test1() took around 2 milliseconds,
// and on a Core i5 12400, where test1() also took around 2 milliseconds - the same (in fact, sometimes it took longer).
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <new>
#include <thread>
// Number of atomic increments each worker thread performs in one timed run.
static constexpr std::size_t ITERATION_COUNT{10000};
// Number of times the test1()/test2() pair is repeated for each memory order.
static constexpr std::size_t TEST_RUN_COUNT{10};
struct Data1
{
std::atomic<std::uint32_t> value1;
std::atomic<std::uint32_t> value2;
} __attribute__((aligned(std::hardware_destructive_interference_size)));
static Data1 gData1;
// Two counters, each aligned on its own to
// std::hardware_destructive_interference_size.
//
// Because every member carries its own alignment requirement, value1 and
// value2 start on separate interference-size boundaries, so threads updating
// different counters do not touch the same aligned unit.
//
// The standard `alignas` specifier is used instead of the GNU-specific
// `__attribute__((aligned(...)))`; the struct inherits its alignment from its
// members, so no separate struct-level specifier is needed. The counters are
// explicitly zero-initialised because a default-constructed std::atomic holds
// an indeterminate value before C++20.
struct Data2
{
    alignas(std::hardware_destructive_interference_size) std::atomic<std::uint32_t> value1{0};
    alignas(std::hardware_destructive_interference_size) std::atomic<std::uint32_t> value2{0};
};

// Single shared instance of the separated-counter layout.
static Data2 gData2;
// Runs one timed measurement shared by test1() and test2().
//
// Two threads are started; one performs ITERATION_COUNT fetch_add(1) operations
// on `first`, the other on `second`, both using the `memory_order` template
// argument.
//
// Each worker first checks in and then waits at a start gate. The clock is
// started only after both workers have checked in, immediately before the gate
// is opened. This keeps thread creation and start-up cost out of the measured
// interval and makes the two increment loops run at the same time; without the
// gate, one thread may finish its short loop before the other has even started,
// in which case no contention is measured at all.
//
// Returns the elapsed time in milliseconds from opening the gate until both
// threads have been joined.
template<std::memory_order memory_order>
static float runCounterPair(std::atomic<std::uint32_t>& first, std::atomic<std::uint32_t>& second) noexcept
{
    std::atomic<std::uint32_t> readyCount{0};
    std::atomic<bool> go{false};

    // The closure only holds references to the two local synchronisation
    // objects, which outlive both threads because they are joined below.
    auto worker = [&readyCount, &go](std::atomic<std::uint32_t>& counter) noexcept
    {
        readyCount.fetch_add(1, std::memory_order_release);
        // Yield while waiting so the gate also works when cores are scarce.
        while(!go.load(std::memory_order_acquire))
            std::this_thread::yield();
        for(std::size_t i = 0; i < ITERATION_COUNT; ++i)
            counter.fetch_add(1, memory_order);
    };

    std::thread thread1(worker, std::ref(first));
    std::thread thread2(worker, std::ref(second));

    // Wait until both workers are parked at the gate.
    while(readyCount.load(std::memory_order_acquire) != 2)
        std::this_thread::yield();

    const auto start = std::chrono::steady_clock::now();
    go.store(true, std::memory_order_release);
    thread1.join();
    thread2.join();
    const auto end = std::chrono::steady_clock::now();

    return std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(end - start).count();
}

// Times two threads incrementing gData1.value1 and gData1.value2 concurrently
// (the counters that are stored next to each other) and prints the elapsed
// milliseconds prefixed with "test1: ".
template<std::memory_order memory_order>
static void test1() noexcept
{
    const float elapsedTime = runCounterPair<memory_order>(gData1.value1, gData1.value2);
    std::cout << "test1: " << elapsedTime << std::endl;
}

// Times two threads incrementing gData2.value1 and gData2.value2 concurrently
// (the counters that are individually aligned) and prints the elapsed
// milliseconds prefixed with "test2: ".
template<std::memory_order memory_order>
static void test2() noexcept
{
    const float elapsedTime = runCounterPair<memory_order>(gData2.value1, gData2.value2);
    std::cout << "test2: " << elapsedTime << std::endl;
}
// Program entry point.
//
// Prints the reported hardware concurrency, the destructive interference size
// and the size of the counter type, resets all four global counters to zero,
// and then runs the test1()/test2() pair TEST_RUN_COUNT times: first with
// sequentially consistent ordering, then with relaxed ordering.
int main()
{
    using Counter = std::atomic<std::uint32_t>;

    // Platform information that helps interpret the timings.
    std::cout << "hardware concurrency: " << std::thread::hardware_concurrency() << std::endl;
    std::cout << "destructive interference size: " << std::hardware_destructive_interference_size << std::endl;
    std::cout << "sizeof(std::atomic<std::uint32_t>): " << sizeof(Counter) << std::endl;

    // Assignment to an atomic performs a sequentially consistent store.
    gData1.value1 = 0;
    gData1.value2 = 0;
    gData2.value1 = 0;
    gData2.value2 = 0;

    std::cout << "memory order: sequentially consistent" << std::endl;
    std::size_t run = 0;
    while(run < TEST_RUN_COUNT)
    {
        std::cout << "Run: " << run << std::endl;
        test1<std::memory_order_seq_cst>();
        test2<std::memory_order_seq_cst>();
        ++run;
    }

    std::cout << "memory order: relaxed" << std::endl;
    run = 0;
    while(run < TEST_RUN_COUNT)
    {
        std::cout << "Run: " << run << std::endl;
        test1<std::memory_order_relaxed>();
        test2<std::memory_order_relaxed>();
        ++run;
    }

    return 0;
}