-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrows_to_tiles.cpp
156 lines (139 loc) · 5.32 KB
/
rows_to_tiles.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#include <bitset>
#include <cassert>
#include <chrono>
#include <iostream>
#include <limits.h>
#include <new>
#include <stdint.h>
#include <stdio.h>
#ifdef __AVX2__
#include <immintrin.h>
#endif
void PrintBitArray(uint64_t * arr, unsigned len){
for(unsigned i = 0; i < len; ++i)
std::cout << std::bitset<64>(arr[i]) << std::endl;
}
void PrintBlock(uint64_t num){
for(unsigned j = 0; j < 64; j += 8)
std::cout << std::bitset<8>(char(num >> j)) << std::endl;
std::cout << std::endl;
}
void EightByEightSubmatricesNaive(uint64_t * src, uint64_t * dest, unsigned len){
// assumes dest is initialized to all zeros.
for(uint64_t * srcPtr = src; srcPtr < src + len; ++srcPtr){
unsigned srcInd = (srcPtr - src) % 8;
uint64_t * destBlock = dest + 8*((srcPtr - src)/8);
for(unsigned destInd = 0; destInd < 8; ++destInd){
uint64_t * destPtr = destBlock + destInd;
*destPtr |= ((*srcPtr >> 8*destInd) & 0xFF) << (8*srcInd);
}
// num at srcPtr contributes the srcInd byte in each submatrix.
// destInd byte of that number belongs to submatrix at destPtr.
}
}
void EightByEightSubmatricesAVX(uint64_t * src, uint64_t * dest, unsigned len){
assert(len % 8 == 0);
#ifdef __AVX2__
uint64_t * p = src;
uint64_t * endp = src+len;
uint64_t * q = dest;
do {
// process 8 rows at a time, split into two __m256i's halves.
// within each half, move lower 32 of each uint64
// into bottom 128, upper 32 into top 128.
__m256i gatherBy = _mm256_set_epi32(7,5,3,1,6,4,2,0);
__m256i permuted1 = _mm256_i32gather_epi32 (p, gatherBy, 4);
__m256i permuted2 = _mm256_i32gather_epi32 (p+32/sizeof(p[0]), gatherBy, 4);
// bring together bytes in the same position.
__m256i shuffleBy = _mm256_set_epi8(15,11,7,3,
14,10,6,2,
13, 9,5,1,
12, 8,4,0,
15,11,7,3,
14,10,6,2,
13, 9,5,1,
12, 8,4,0);
__m256i temp[2];
temp[0] = _mm256_shuffle_epi8(permuted1, shuffleBy);
temp[1] = _mm256_shuffle_epi8(permuted2, shuffleBy);
// top halves of the 8 submatrices are now in temp[0],
// bottom halves in temp[1].
// bring together top and bottom halves.
__m256i firstFourBlocks = _mm256_i32gather_epi32(temp,
_mm256_set_epi32(11,3,10,2,9,1,8,0), 4);
__m256i secondFourBlocks = _mm256_i32gather_epi32(temp,
_mm256_set_epi32(15,7,14,6,13,5,12,4), 4);
_mm256_storeu_si256((__m256i*)q, firstFourBlocks);
_mm256_storeu_si256((__m256i*)(q+32/sizeof(q[0])), secondFourBlocks);
p += 64/sizeof(p[0]);
q += 64/sizeof(q[0]);
// a combination of _mm256_permutevar8x32_epi32
// and _mm256_shuffle_epi8 would also work,
// but in this case a gather is simpler (and faster).
} while (p < endp);
#else
std::cout << "AVX2 not supported" << std::endl;
exit(1);
#endif
}
int main(int, char *argv[]){
static const bool DEBUG = false;
static const unsigned M = 16;
if (DEBUG){
srand (time(NULL));
uint64_t test[M] = {0};
uint64_t avxResult[M] = {0};
uint64_t naiveResult[M] = {0};
for(unsigned i = 0; i < M; ++i)
test[i] = uint64_t(rand()) | (uint64_t(rand()) << 32);
EightByEightSubmatricesAVX(test, avxResult, M);
EightByEightSubmatricesNaive(test, naiveResult, M);
for (unsigned i = 0; i < M; ++i){
if(avxResult[i] != naiveResult[i]){
std::cout << "issue at submatrix " << i << std::endl;
PrintBitArray(test, M);
std::cout << "expected" << std::endl;
PrintBlock(naiveResult[i]);
std::cout << "actual" << std::endl;
PrintBlock(avxResult[i]);
assert(false);
}
}
}
uint64_t demo[M];
uint64_t submatrices[M] = {0};
// hand-chosen numbers: enough variation that the submatrices are different,
// but not so much that it's chaotic and random-looking.
for (unsigned i = 0; i < 4; ++i)
demo[i] = 0xFF0FFF0F00FF00FFULL;
for (unsigned i = 4; i < 8; ++i)
demo[i] = 0xFF00FF00F0FFF0FFULL;
for (unsigned i = 8; i < 16; ++i)
demo[i] = 0x00FF00FF00FF00FFULL << i;
std::cout << "original bit array:" << std::endl;
PrintBitArray(demo, M);
std::cout << "8-by-8 submatrix blocks, right to left:" << std::endl;
EightByEightSubmatricesAVX(demo, submatrices, M);
for(unsigned i = 0; i < M; ++i)
PrintBlock(submatrices[i]);
#define N 100000
assert(8*N < UINT_MAX);
uint64_t * src = new uint64_t[8*N];
srand (time(NULL));
for(unsigned i = 0; i < 8*N; ++i)
src[i] = uint64_t(rand()) | (uint64_t(rand()) << 32);
std::cout << "time to apply to matrix of " << 8*N << " rows" << std::endl;
for(unsigned n = 0; n < 2; ++n){
uint64_t * dest = new uint64_t[8*N]();
auto startTime = std::chrono::high_resolution_clock::now();
n == 0 ? EightByEightSubmatricesNaive(src, dest, N) :
EightByEightSubmatricesAVX(src, dest, N);
auto endTime = std::chrono::high_resolution_clock::now();
n == 0 ? std::cout << "without AVX: " : std::cout << "with AVX: ";
std::chrono::duration<double, std::milli> passed = endTime - startTime;
std::cout << passed.count() << " ms " << std::endl;
delete[] dest;
}
delete[] src;
return 0;
}