Skip to content

Commit 230e665

Browse files
authored
Merge pull request #4996 from iha-taisei/sdgemv_sve_unroll
Loop-unrolled transposed [SD]GEMV kernels for A64FX and Neoverse V1
2 parents 5dc4d7d + 4918bee commit 230e665

File tree

4 files changed

+390
-4
lines changed

4 files changed

+390
-4
lines changed

kernel/arm64/KERNEL.A64FX

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE
22

33
SGEMVNKERNEL = gemv_n_sve.c
44
DGEMVNKERNEL = gemv_n_sve.c
5-
SGEMVTKERNEL = gemv_t_sve.c
6-
DGEMVTKERNEL = gemv_t_sve.c
5+
SGEMVTKERNEL = gemv_t_sve_v4x3.c
6+
DGEMVTKERNEL = gemv_t_sve_v4x3.c

kernel/arm64/KERNEL.NEOVERSEV1

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
include $(KERNELDIR)/KERNEL.ARMV8SVE
22

3-
SGEMVTKERNEL = gemv_t_sve.c
4-
DGEMVTKERNEL = gemv_t_sve.c
3+
SGEMVTKERNEL = gemv_t_sve_v1x3.c
4+
DGEMVTKERNEL = gemv_t_sve_v1x3.c

kernel/arm64/gemv_t_sve_v1x3.c

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/***************************************************************************
2+
Copyright (c) 2024, The OpenBLAS Project
3+
All rights reserved.
4+
5+
Redistribution and use in source and binary forms, with or without
6+
modification, are permitted provided that the following conditions are
7+
met:
8+
9+
1. Redistributions of source code must retain the above copyright
10+
notice, this list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright
13+
notice, this list of conditions and the following disclaimer in
14+
the documentation and/or other materials provided with the
15+
distribution.
16+
3. Neither the name of the OpenBLAS project nor the names of
17+
its contributors may be used to endorse or promote products
18+
derived from this software without specific prior written
19+
permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
*****************************************************************************/
32+
33+
#include <arm_sve.h>
34+
35+
#include "common.h"
36+
37+
/*
 * Precision selection: map the generic SV_* names used by the kernel
 * below onto the double- or single-precision SVE intrinsic variants,
 * keyed on the DOUBLE build flag.
 */
#ifdef DOUBLE
#define SV_COUNT svcntd            /* number of active 64-bit lanes per vector */
#define SV_TYPE svfloat64_t        /* vector element type                      */
#define SV_TRUE svptrue_b64        /* all-lanes-active predicate               */
#define SV_WHILE svwhilelt_b64_s64 /* predicate for the loop tail (i < m)      */
#define SV_DUP svdup_f64           /* broadcast a scalar to all lanes          */
#else
#define SV_COUNT svcntw            /* number of active 32-bit lanes per vector */
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32_s64
#define SV_DUP svdup_f32
#endif
51+
/*
 * Transposed [SD]GEMV kernel for SVE: y += alpha * A^T * x, i.e. each of
 * the n columns of A (column stride lda, m rows used) is dotted with x
 * and the alpha-scaled result is accumulated into y.
 *
 * Vector path (inc_x == 1 only): the n columns are split into three
 * consecutive panels of width = ceil(n/3) columns each; every outer-loop
 * iteration streams x once and accumulates one column from each panel,
 * producing up to three y elements per pass over x.
 *
 * dummy1 and buffer are unused by this kernel.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix,iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp;

	iy = 0;

	if (inc_x == 1) {
		/* Number of columns per panel: ceil(n / 3). */
		BLASLONG width = (n + 3 - 1) / 3;

		/* Base of column j within panel k is a + lda * (width * k + j). */
		FLOAT *a0_ptr = a + lda * width * 0;
		FLOAT *a1_ptr = a + lda * width * 1;
		FLOAT *a2_ptr = a + lda * width * 2;

		/* Matching y positions for the three panels (stride inc_y). */
		FLOAT *y0_ptr = y + inc_y * width * 0;
		FLOAT *y1_ptr = y + inc_y * width * 1;
		FLOAT *y2_ptr = y + inc_y * width * 2;

		for (j = 0; j < width; j++) {
			/*
			 * Per-panel column-validity predicates: all-true while the
			 * panel's global column index (j + width * k) is still < n,
			 * all-false once the panel has run out of real columns.  An
			 * all-false predicate turns the corresponding loads and FMLAs
			 * below into no-ops, so padding columns are never touched.
			 */
			svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
			svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
			svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

			/* Per-panel dot-product accumulators, reduced after the loop. */
			SV_TYPE temp00_vec = SV_DUP(0.0);
			SV_TYPE temp01_vec = SV_DUP(0.0);
			SV_TYPE temp02_vec = SV_DUP(0.0);

			i = 0;
			BLASLONG sve_size = SV_COUNT();
			/* Main loop: full vectors only (holds while i + sve_size <= m). */
			while ((i + sve_size * 1 - 1) < m) {
				SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);

				SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
				SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
				SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

				/* temp_k += a_k * x on the lanes where the panel is valid. */
				temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
				temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
				temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);

				i += sve_size * 1;
			}

			/* Tail: fewer than sve_size rows remain. */
			if (i < m) {
				/* Lanes with i + lane < m are active. */
				svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

				/* Combine the row-tail predicate with each panel predicate. */
				pg00 = svand_z(SV_TRUE(), pg0, pg00);
				pg01 = svand_z(SV_TRUE(), pg0, pg01);
				pg02 = svand_z(SV_TRUE(), pg0, pg02);

				SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);

				SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
				SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
				SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

				temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
				temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
				temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
			}

			/*
			 * Horizontal-add each accumulator and scatter into y, guarded
			 * by the same column-validity tests used for the predicates.
			 */
			if ((j + width * 0) < n) {
				temp = svaddv(SV_TRUE(), temp00_vec);
				y0_ptr[iy] += alpha * temp;
			}
			if ((j + width * 1) < n) {
				temp = svaddv(SV_TRUE(), temp01_vec);
				y1_ptr[iy] += alpha * temp;
			}
			if ((j + width * 2) < n) {
				temp = svaddv(SV_TRUE(), temp02_vec);
				y2_ptr[iy] += alpha * temp;
			}
			iy += inc_y;

			/* Advance to the next column of every panel. */
			a0_ptr += lda;
			a1_ptr += lda;
			a2_ptr += lda;
		}

		return(0);
	}

	/* Scalar fallback for strided x: one plain dot product per column. */
	a_ptr = a;
	for (j = 0; j < n; j++) {
		temp = 0.0;
		ix = 0;
		for (i = 0; i < m; i++) {
			temp += a_ptr[i] * x[ix];
			ix += inc_x;
		}
		y[iy] += alpha * temp;
		iy += inc_y;
		a_ptr += lda;
	}
	return(0);
}

0 commit comments

Comments
 (0)