forked from cornell-cs5220-f15/matmul-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdgemm_avx_noblock.c
133 lines (100 loc) · 3.68 KB
/
dgemm_avx_noblock.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#include "immintrin.h"
#include <string.h>
/* Human-readable label for this dgemm implementation.
   NOTE(review): presumably read by an external benchmark driver; it is not
   referenced anywhere in the visible part of this file -- confirm. */
const char* dgemm_desc = "AVX + copy + padded dgemm.";
/* Tile edge length, in doubles. square_dgemm rounds the matrix size up to a
   multiple of this value and steps through the padded matrix in strides of it.
   The #ifndef lets a build predefine another value, but the dgemm_4x4 kernel
   is written for a width of exactly 4, so other values are not usable as-is. */
#ifndef REGISTER_SIZE
#define REGISTER_SIZE ((int) 4)
#endif
/*
lda is the leading dimension of the matrix (the M of square_dgemm).
*/
void dgemm_4x4(const int lda, const double * restrict A,
const double * restrict B, double * restrict C)
{
//A, B, C are pointers to the relative positions where the blocks begin
__m256d c_0 = _mm256_load_pd(C + 0);
__m256d c_1 = _mm256_load_pd(C + 1*lda);
__m256d c_2 = _mm256_load_pd(C + 2*lda);
__m256d c_3 = _mm256_load_pd(C + 3*lda);
//4 columns of A matrix
__m256d a_k;
//4 corresponding rows of matrix B
__m256d b_0k, b_1k, b_2k, b_3k;
//temp variables to store dot products
__m256d ctemp_0, ctemp_1, ctemp_2, ctemp_3;
for (int k = 0; k < lda; ++k) {
a_k = _mm256_load_pd(A + k*4);
b_0k = _mm256_broadcast_sd(B + k);
b_1k = _mm256_broadcast_sd(B + k + lda);
b_2k = _mm256_broadcast_sd(B + k + 2*lda);
b_3k = _mm256_broadcast_sd(B + k + 3*lda);
ctemp_0 = _mm256_mul_pd(a_k, b_0k);
ctemp_1 = _mm256_mul_pd(a_k, b_1k);
ctemp_2 = _mm256_mul_pd(a_k, b_2k);
ctemp_3 = _mm256_mul_pd(a_k, b_3k);
c_0 = _mm256_add_pd(c_0, ctemp_0);
c_1 = _mm256_add_pd(c_1, ctemp_1);
c_2 = _mm256_add_pd(c_2, ctemp_2);
c_3 = _mm256_add_pd(c_3, ctemp_3);
}
_mm256_store_pd(C+0, c_0);
_mm256_store_pd(C+1*lda, c_1);
_mm256_store_pd(C+2*lda, c_2);
_mm256_store_pd(C+3*lda, c_3);
}
/*
 * Copy an M-by-M matrix X (stride M) into an M_padded-by-M_padded buffer
 * X_aligned (stride M_padded). Every destination entry whose slow or fast
 * index is >= M is set to zero, so the source ends up in the leading
 * M-by-M corner surrounded by zero padding.
 */
void copy_and_pad(const int M, const double* X, const int M_padded, double* X_aligned) {
    for (int outer = 0; outer < M_padded; ++outer) {
        double *dst = X_aligned + outer*M_padded;
        for (int inner = 0; inner < M_padded; ++inner) {
            /* Source is only read when both indices fall inside the M-by-M data. */
            dst[inner] = (outer < M && inner < M) ? X[inner + outer*M] : 0;
        }
    }
}
/*
 * Inverse of the padding copy: take the leading M-by-M corner of X_aligned
 * (stride M_padded) and store it densely into X (stride M). Entries of
 * X_aligned outside that corner are ignored.
 */
void copy_unpad(const int M, double* X, const int M_padded, const double* X_aligned) {
    for (int outer = 0; outer < M; ++outer) {
        const double *src = X_aligned + outer*M_padded;
        double *dst = X + outer*M;
        for (int inner = 0; inner < M; ++inner) {
            dst[inner] = src[inner];
        }
    }
}
/*
 * Pack a tile_width-wide strip of the M-by-M matrix X (stride M) into the
 * contiguous buffer X_tile, which holds M_padded groups of tile_width doubles.
 *
 * For slow index k and position t inside the strip:
 *   X_tile[t + k*tile_width] = X[(t + X_col_offset) + k*M]
 * whenever k < M and t + X_col_offset < M, and 0 otherwise.
 *
 * Note that X_col_offset is added to the fast (contiguous) index of X.
 */
void tile_copy_and_pad(const int M, const double* restrict X, const int M_padded, const int X_col_offset, const int tile_width, double* restrict X_tile) {
    for (int k = 0; k < M_padded; ++k) {
        double *out = X_tile + k*tile_width;
        for (int t = 0; t < tile_width; ++t) {
            /* Anything past the end of the real data becomes zero padding. */
            out[t] = (k >= M || t + X_col_offset >= M)
                         ? 0
                         : X[t + X_col_offset + k*M];
        }
    }
}
/*
 * Plain triple-loop product, used only when the packed AVX path cannot get
 * its scratch buffers. Overwrites C with A*B using the same indexing
 * (element (i, j) at X[i + j*M]) and the same k-ascending summation order
 * as the AVX path, and needs no extra memory.
 */
static void square_dgemm_scalar(const int M, const double* A, const double* B, double* C)
{
    const size_t n = (size_t) M;
    for (size_t j = 0; j < n; ++j) {
        for (size_t i = 0; i < n; ++i) {
            double sum = 0.0;
            for (size_t k = 0; k < n; ++k) {
                sum += A[i + k*n] * B[k + j*n];
            }
            C[i + j*n] = sum;
        }
    }
}

/*
 * Compute C = A * B for M-by-M matrices stored with element (i, j) at
 * X[i + j*M]. Any previous contents of C are discarded.
 *
 * B is copied into a 32-byte-aligned buffer whose dimension is M rounded up
 * to a multiple of REGISTER_SIZE, with zero padding. A is repacked one
 * REGISTER_SIZE-wide strip at a time, and dgemm_4x4 accumulates each block
 * of an aligned, zero-initialised C buffer. The top-left M-by-M corner of
 * that buffer is finally copied into C.
 *
 * M <= 0 is a no-op. If the scratch buffers cannot be allocated (or their
 * size does not fit in size_t) the product is computed by a scalar fallback
 * instead of dereferencing a null pointer.
 */
void square_dgemm(const int M, const double* A, const double* B, double* C)
{
    if (M <= 0) {
        return;
    }

    //Round up M to the nearest multiple of the register size
    const int M_padded = ( M/REGISTER_SIZE + (M%REGISTER_SIZE ? 1 : 0) ) * REGISTER_SIZE;

    //Do all size arithmetic in size_t: M_padded*M_padded overflows int
    //once M_padded exceeds 46340.
    const size_t padded = (size_t) M_padded;
    const size_t max_elems = (size_t) -1 / sizeof(double);
    if (padded > max_elems / padded) {
        square_dgemm_scalar(M, A, B, C);
        return;
    }
    const size_t matrix_bytes = padded * padded * sizeof(double);
    //The tile is filled REGISTER_SIZE columns wide, so size it the same way.
    const size_t tile_bytes = (size_t) REGISTER_SIZE * padded * sizeof(double);

    //Allocate memory for aligned copies
    double* restrict B_aligned = _mm_malloc(matrix_bytes, 32);
    double* restrict C_aligned = _mm_malloc(matrix_bytes, 32);
    double* restrict A_tile = _mm_malloc(tile_bytes, 32);

    if (B_aligned == NULL || C_aligned == NULL || A_tile == NULL) {
        //_mm_free is not documented to accept NULL, so guard each release.
        if (A_tile != NULL) {
            _mm_free(A_tile);
        }
        if (B_aligned != NULL) {
            _mm_free(B_aligned);
        }
        if (C_aligned != NULL) {
            _mm_free(C_aligned);
        }
        square_dgemm_scalar(M, A, B, C);
        return;
    }

    //copy B into aligned memory with padding
    copy_and_pad(M, B, M_padded, B_aligned);
    memset(C_aligned, 0, matrix_bytes);

    for (int block_i = 0; block_i < M_padded; block_i += REGISTER_SIZE) {
        //copy (4-by-M) tile of A into contiguous, aligned memory
        tile_copy_and_pad(M, A, M_padded, block_i, REGISTER_SIZE, A_tile);

        //For each long tile, compute the product with B in 4-by-4 blocks
        for (int block_j = 0; block_j < M_padded; block_j += REGISTER_SIZE) {
            const size_t col_start = (size_t) block_j * padded;
            dgemm_4x4(M_padded, A_tile, B_aligned + col_start,
                      C_aligned + block_i + col_start);
        }
    }

    //copy output back into original matrix
    copy_unpad(M, C, M_padded, C_aligned);

    _mm_free(A_tile);
    _mm_free(B_aligned);
    _mm_free(C_aligned);
}