-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOptimize1x4.cpp
104 lines (97 loc) · 2.34 KB
/
Optimize1x4.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include "helpler.h"
#include "Optimize1x4.h"
// Computes C = A*B + C, where
//   A is m x n,
//   B is n x k,
//   C is m x k.
// Optimize3_1x4: walks the output one row at a time and four columns per
// step, calling AddDot1x4 once for every (row, column-group) pair.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize3_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize4_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to AddDot1x4_inline.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_inline.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize4_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_inline(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize5_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to AddDot1x4_fused.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_fused.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize5_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_fused(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize6_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to
// AddDot1x4_register.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_register.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize6_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_register(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize7_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to
// AddDot1x4_reduce_B_indexing.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_reduce_B_indexing.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize7_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_reduce_B_indexing(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize8_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to
// AddDot1x4_unroll_k.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_unroll_k.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize8_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_unroll_k(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}
// Optimize9_1x4: same traversal as the other 1x4 drivers in this file (one
// row at a time, four columns per step), but dispatches to
// AddDot1x4_reduce_ptrB_update.
//
//   A, B, C  - matrix buffers, addressed through the A(), B(), C() accessor
//              macros (defined outside this file).
//   m, n, k  - dimensions; forwarded unchanged to the kernel.
//
// Each call receives the address of A(row,0), B(0,col) and C(row,col);
// presumably the kernel updates C(row,col)..C(row,col+3) - confirm against
// AddDot1x4_reduce_ptrB_update.
// NOTE(review): the file header describes C as m x k, yet the column loop is
// bounded by n; the two only coincide when n == k - confirm which is intended.
// NOTE(review): no tail handling is done here when the column bound is not a
// multiple of 4 - verify that callers guarantee it.
void Optimize9_1x4(float* A, float* B, float* C, int m, int n, int k)
{
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            AddDot1x4_reduce_ptrB_update(&A(row, 0), &B(0, col), &C(row, col), m, n, k);
            col += 4;  // advance to the next group of four columns
        }
        ++row;
    }
}