diff --git a/.travis.yml b/.travis.yml
index 6cb75cf877..d2a1fb842e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -86,6 +86,11 @@ matrix:
env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \
CC=riscv32-unknown-linux-gnu-gcc \
LDFLAGS=-static
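+ # SiFive X280: cross-compile with clang and run the fast testsuite under qemu-riscv64 (VLEN=512).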
+ - os: linux
+ compiler: clang
+ env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="sifive_x280" \
+ CC=clang \
+ LDFLAGS=-static
install:
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
@@ -106,6 +111,12 @@ script:
export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
fi
+- if [ "$CONF" = "sifive_x280" ]; then
+ $DIST_PATH/travis/do_riscv.sh "$CONF";
+ export CC=$DIST_PATH/../toolchain/riscv/bin/clang;
+ export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++;
+ export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
+ fi
- $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF
- pwd
- ls -l
diff --git a/CREDITS b/CREDITS
index b81ca66526..03953b1a19 100644
--- a/CREDITS
+++ b/CREDITS
@@ -17,11 +17,13 @@ but many others have contributed code, ideas, and feedback, including
Alex Arslan @ararslan
Vernon Austel (IBM, T.J. Watson Research Center)
Mohsen Aznaveh @Aznaveh (Texas A&M University)
+ Abhishek Bagusetty @abagusetty (Argonne National Laboratory)
Satish Balay @balay (Argonne National Laboratory)
Kihiro Bando @bandokihiro
Matthew Brett @matthew-brett (University of Birmingham)
Jérémie du Boisberranger @jeremiedbb
Jed Brown @jedbrown (Argonne National Laboratory)
+ Alex Chiang @alexsifivetw (SiFive)
Robin Christ @robinchrist
Dilyn Corner @dilyn-corner
Mat Cross @matcross (NAG)
@@ -37,12 +39,14 @@ but many others have contributed code, ideas, and feedback, including
Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center)
Evgeny Epifanovsky @epifanovsky (Q-Chem)
Isuru Fernando @isuruf
+ James Foster @jd-foster (CSIRO)
Roman Gareev @gareevroman
Richard Goldschmidt @SuperFluffy
Chris Goodyer
Alexander Grund @Flamefire
John Gunnels @jagunnels (IBM, T.J. Watson Research Center)
Ali Emre Gülcü @Lephar
+ @h-vetinari
Jeff Hammond @jeffhammond (Intel)
Jacob Gorm Hansen @jacobgorm
Shivaprashanth H (Global Edge)
@@ -52,7 +56,9 @@ but many others have contributed code, ideas, and feedback, including
Minh Quan Ho @hominhquan
Matthew Honnibal @honnibal
Stefan Husmann @stefanhusmann
+ Aaron Hutchinson @Aaron-Hutchinson (SiFive)
Francisco Igual @figual (Universidad Complutense de Madrid)
+ John Mather @jmather-sesi (SideFX Software)
Madeesh Kannan @shadeMe
Tony Kelman @tkelman
Lee Killough @leekillough (Tactical Computing Labs)
@@ -125,12 +131,12 @@ but many others have contributed code, ideas, and feedback, including
Meghana Vankadari @Meghana-vankadari (AMD)
Kiran Varaganti @kvaragan (AMD)
Natalia Vassilieva (Hewlett Packard Enterprise)
- @h-vetinari
Andrew Wildman @awild82 (University of Washington)
Zhang Xianyi @xianyi (Chinese Academy of Sciences)
Benda Xu @heroxbd
Guodong Xu @docularxu (Linaro.org)
RuQing Xu @xrq-phys (The University of Tokyo)
+ Srinivas Yadav @srinivasyadav18
Costas Yamin @cosstas
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
Roman Yurchak @rth (Symerio)
diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c
index d3871d8f77..a61d1b95d4 100644
--- a/config/bgq/bli_cntx_init_bgq.c
+++ b/config/bgq/bli_cntx_init_bgq.c
@@ -69,11 +69,11 @@ void bli_cntx_init_bgq( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 1024, 0, 768 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 2048, 0, 1536 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 10240, 0, 10240 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 1024, -1, 768 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 2048, -1, 1536 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 10240, -1, 10240 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c
index 6af3ff91ce..55a8000e74 100644
--- a/config/cortexa9/bli_cntx_init_cortexa9.c
+++ b/config/cortexa9/bli_cntx_init_cortexa9.c
@@ -69,11 +69,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c
index 8f615588c6..bbaf37541b 100644
--- a/config/knc/bli_cntx_init_knc.c
+++ b/config/knc/bli_cntx_init_knc.c
@@ -67,13 +67,13 @@ void bli_cntx_init_knc( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 30, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0,
- 0, 160, 0, 0 );
- bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0,
- 0, 300, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 30, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1,
+ -1, 160, -1, -1 );
+ bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 240, -1, -1,
+ -1, 300, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c
index 964438e834..30b3ac9fa4 100644
--- a/config/penryn/bli_cntx_init_penryn.c
+++ b/config/penryn/bli_cntx_init_penryn.c
@@ -77,11 +77,11 @@ void bli_cntx_init_penryn( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c
index d5ffe7dcfa..9d1de3da5c 100644
--- a/config/power7/bli_cntx_init_power7.c
+++ b/config/power7/bli_cntx_init_power7.c
@@ -67,11 +67,11 @@ void bli_cntx_init_power7( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 64, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
new file mode 100644
index 0000000000..197394c822
--- /dev/null
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -0,0 +1,226 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_sifive_x280( cntx_t* cntx )
+{
+ blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+ // Set default kernel blocksizes and functions.
+ bli_cntx_init_sifive_x280_ref( cntx );
+
+ // -------------------------------------------------------------------------
+
+ // Update the context with optimized native kernels.
+ bli_cntx_set_ukrs
+ (
+ cntx,
+
+ // Level 1
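+ // Kernels with an _intr suffix are (presumably) implemented with RVV
+ // intrinsics, while _asm kernels are hand-written assembly.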
+ BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr,
+
+ BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm,
+
+ BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr,
+
+ BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr,
+
+ BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm,
+
+ BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr,
+
+ BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr,
+
+ BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm,
+
+ BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm,
+
+ BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr,
+
+ BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr,
+
+ BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm,
+
+ BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr,
+
+ BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm,
+
+ BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr,
+
+ // Level 1f
+ BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr,
+
+ BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm,
+
+ BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm,
+
+ BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr,
+
+ BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm,
+
+ // Level 1m
+ BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_7xk,
+ BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_7xk,
+ BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6xk,
+ BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6xk,
+ BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_64xk,
+ BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_32xk,
+ BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_32xk,
+ BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_16xk,
+
+ // Level 3
+ BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4,
+ BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_asm_7m4,
+ BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2,
+ BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2,
+
+ BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm,
+
+ BLIS_VA_END
+ );
+
+ // Update the context with storage preferences.
+ bli_cntx_set_ukr_prefs
+ (
+ cntx,
+
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+
+ BLIS_VA_END
+ );
+
+ // Initialize level-3 blocksize objects with architecture-specific values.
+ // s d c z
+ bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6,
+ 8, 8, 8, 8 );
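+ // The second row of BLIS_MR values above sets the maximum (packing)
+ // register blocksizes, which must agree with BLIS_PACKMR_? in
+ // bli_kernel_defs_sifive_x280.h.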
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 32, 32, 16 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 28, 28, 24, 24 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 1024, 1024, 1024, 1024 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 128, 256, 128 );
+ // BLIS_BBM and BLIS_BBN default to 1, but they are set here explicitly to ensure the correct values are used.
+ bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 );
+
+ // Update the context with the current architecture's register and cache
+ // blocksizes (and multiples) for native execution.
+ bli_cntx_set_blkszs
+ (
+ cntx,
+
+ // level-3
+ BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+ BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+ BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+ BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+ BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+ // level-1m
+ BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM,
+ BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN,
+
+ BLIS_VA_END
+ );
+}
+
diff --git a/config/sifive_x280/bli_family_sifive_x280.h b/config/sifive_x280/bli_family_sifive_x280.h
new file mode 100644
index 0000000000..4f02c048fa
--- /dev/null
+++ b/config/sifive_x280/bli_family_sifive_x280.h
@@ -0,0 +1,34 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
diff --git a/config/sifive_x280/bli_kernel_defs_sifive_x280.h b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
new file mode 100644
index 0000000000..bb6865a669
--- /dev/null
+++ b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
@@ -0,0 +1,55 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+#define BLIS_MR_s 7
+#define BLIS_MR_d 7
+#define BLIS_MR_c 6
+#define BLIS_MR_z 6
+
+#define BLIS_PACKMR_s 8
+#define BLIS_PACKMR_d 8
+#define BLIS_PACKMR_c 8
+#define BLIS_PACKMR_z 8
+
+#define BLIS_NR_s 64
+#define BLIS_NR_d 32
+#define BLIS_NR_c 32
+#define BLIS_NR_z 16
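+
+// NOTE: the blocksizes above must be kept in sync with those registered
+// in bli_cntx_init_sifive_x280.c.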
+//#endif
+
diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk
new file mode 100644
index 0000000000..acdf5a3611
--- /dev/null
+++ b/config/sifive_x280/make_defs.mk
@@ -0,0 +1,78 @@
+#
+#
+# BLIS
+# An object-based framework for developing high-performance BLAS-like
+# libraries.
+#
+# Copyright (C) 2023, SiFive, Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# - Neither the name(s) of the copyright holder(s) nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG := sifive_x280
+#CONFIGS_INCL += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
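+# Note: rv64gcv_zba_zbb_zvl512b selects RV64GC plus the vector extension with
+# a minimum VLEN of 512 bits and the Zba/Zbb bit-manipulation extensions;
+# -mabi=lp64d and -mcmodel=medany select the 64-bit hard-float ABI and the
+# medium-any code model.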
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d
+CPPROCFLAGS :=
+CMISCFLAGS := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \
+ -fdiagnostics-color=always -fno-rtti -fno-exceptions
+CPICFLAGS := -fPIC
+CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \
+ -Wno-sign-compare -Wno-unused-variable
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS := -O0
+else
+COPTFLAGS := -Ofast
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS := $(COPTFLAGS)
+CKVECFLAGS :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS := $(CKOPTFLAGS)
+CRVECFLAGS := $(CKVECFLAGS)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c
index 4bacc5d63c..8e5a57d6cf 100644
--- a/config/template/bli_cntx_init_template.c
+++ b/config/template/bli_cntx_init_template.c
@@ -87,11 +87,11 @@ void bli_cntx_init_template( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 0, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 0, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 0, 0, 128 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 0, 0, 256 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 0, 0, 4096 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, -1, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, -1, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, -1, -1, 128 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, -1, -1, 256 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, -1, -1, 4096 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 88f39c3d13..0bd4ed3441 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -1,6 +1,6 @@
#
#
-# BLIS
+# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
@@ -35,7 +35,7 @@
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
-THIS_CONFIG := zen3
+THIS_CONFIG := zen3
#CONFIGS_INCL += $(THIS_CONFIG)
#
@@ -65,8 +65,8 @@ endif
# they make explicit use of the rbp register.
CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer
CROPTFLAGS := $(CKOPTFLAGS)
-CKVECFLAGS := -mavx2 -mfma -mfpmath=sse
-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+CKVECFLAGS := -mavx2 -mfma
+CRVECFLAGS := $(CKVECFLAGS)
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1.
CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
@@ -77,6 +77,8 @@ ifeq ($(CC_VENDOR),gcc)
CVECFLAGS_VER := -march=znver3
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0.
@@ -92,6 +94,8 @@ ifeq ($(CC_VENDOR),clang)
endif
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),aocc)
ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0.
@@ -103,8 +107,14 @@ ifeq ($(CC_VENDOR),aocc)
CVECFLAGS_VER := -march=znver3
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
+ifeq ($(CC_VENDOR),nvc)
+ CVECFLAGS_VER := -march=znver3
+ CRVECFLAGS += -fast
+else
- $(error gcc, clang, or aocc is required for this configuration.)
+ $(error gcc, clang, nvc, or aocc is required for this configuration.)
+endif
endif
endif
endif
@@ -114,4 +124,3 @@ CRVECFLAGS += $(CVECFLAGS_VER)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
-
diff --git a/config_registry b/config_registry
index 61482ae7b8..09a33bc9a5 100644
--- a/config_registry
+++ b/config_registry
@@ -59,5 +59,8 @@ rv64i: rv64i/rvi
rv32iv: rv32iv/rviv
rv64iv: rv64iv/rviv
+# SiFive architectures.
+sifive_x280: sifive_x280
+
# Generic architectures.
generic: generic
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 6f2ef49c55..d8f8b13f40 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -101,7 +101,7 @@ The `cores` value is most appropriate for BLIS since we usually want to ignore h
Setting these two variables is often enough. However, it obviously does not offer the level of control that `GOMP_CPU_AFFINITY` does. Sometimes, it takes some experimentation to determine whether a particular mapping is performing as expected. If multithreaded performance on eight cores is only twice what it is observed of single-threaded performance, the affinity mapping may be to blame. But if performance is six or seven times higher than sequential execution, then the mapping you chose is probably working fine.
-Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity/) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then choose your preferred method of expressing that mapping to the OpenMP implementation.
+Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](https://web.archive.org/web/20190130102805/http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then to choose a preferred method of expressing that mapping to the OpenMP implementation.
# Specifying multithreading
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index 7f74010451..f38710ea84 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -37,7 +37,7 @@
BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
void_fp var_func,
bszid_t bmid_m,
bszid_t bmid_n,
@@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
#endif
// Allocate a packm_params_t struct.
- params = bli_sba_acquire( pool, sizeof( packm_params_t ) );
+ params = bli_sba_acquire( sba_pool, sizeof( packm_params_t ) );
// Initialize the packm_params_t struct.
params->size = sizeof( packm_params_t );
@@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
// sync with the cntl_t tree.
cntl = bli_cntl_create_node
(
- pool,
+ sba_pool,
BLIS_NOID,
BLIS_NO_PART,
var_func,
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index 8a43f711d1..a94a465b25 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
cntl_t* bli_packm_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
void_fp var_func,
bszid_t bmid_m,
bszid_t bmid_n,
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index 88ec5def91..dc1d3bb1ba 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -89,7 +89,7 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
// Create a default control tree for the operation, if needed.
cntl_t* cntl_use;
- pool_t* sba_pool = bli_apool_array_elem( tid, array );
+ pool_t* sba_pool = bli_sba_array_elem( tid, array );
bli_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, sba_pool, NULL, &cntl_use );
diff --git a/frame/3/bli_l3_sup_decor.c b/frame/3/bli_l3_sup_decor.c
index 7cda8bdcaa..d420559b58 100644
--- a/frame/3/bli_l3_sup_decor.c
+++ b/frame/3/bli_l3_sup_decor.c
@@ -69,7 +69,7 @@ static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, co
bli_l3_thread_decorator_thread_check( gl_comm, rntm );
// Create the root node of the thread's thrinfo_t structure.
- pool_t* pool = bli_apool_array_elem( tid, array );
+ pool_t* pool = bli_sba_array_elem( tid, array );
thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
func
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 95d2a54398..5f3d39d391 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -44,16 +44,14 @@ thrinfo_t* bli_l3_thrinfo_create
const cntl_t* cntl
)
{
- pool_t* pool = NULL;
- if ( array != NULL )
- pool = bli_apool_array_elem( id, array );
+ pool_t* sba_pool = bli_sba_array_elem( id, array );
// Create the root thrinfo_t node.
thrinfo_t* root = bli_thrinfo_create_root
(
gl_comm,
id,
- pool,
+ sba_pool,
bli_pba_query()
);
@@ -123,7 +121,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
(
dim_t id,
thrcomm_t* gl_comm,
- pool_t* pool,
+ pool_t* sba_pool,
const rntm_t* rntm
)
{
@@ -132,7 +130,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
(
gl_comm,
id,
- pool,
+ sba_pool,
bli_pba_query()
);
@@ -176,10 +174,10 @@ void bli_l3_sup_thrinfo_update
thrinfo_t** root
)
{
- thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
- dim_t tid = bli_thrinfo_thread_id( *root );
- pool_t* pool = bli_thrinfo_sba_pool( *root );
- dim_t nt = bli_thrinfo_num_threads( *root );
+ thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
+ dim_t tid = bli_thrinfo_thread_id( *root );
+ pool_t* sba_pool = bli_thrinfo_sba_pool( *root );
+ dim_t nt = bli_thrinfo_num_threads( *root );
// Return early in single-threaded execution
// since the thread control tree may not have been
@@ -187,7 +185,7 @@ void bli_l3_sup_thrinfo_update
if ( nt == 1 ) return;
bli_thrinfo_free( *root );
- *root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
+ *root = bli_l3_sup_thrinfo_create( tid, gl_comm, sba_pool, rntm );
}
// -----------------------------------------------------------------------------
diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h
index d06f79207b..c11171a27c 100644
--- a/frame/base/bli_apool.h
+++ b/frame/base/bli_apool.h
@@ -56,7 +56,7 @@ BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
return &(apool->pool);
}
-BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
+BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
{
return &(apool->mutex);
}
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index af8f671859..a53a2fb64c 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -286,6 +286,11 @@ arch_t bli_arch_query_id_impl( void )
id = BLIS_ARCH_RV64IV;
#endif
+ // SiFive microarchitectures.
+ #ifdef BLIS_FAMILY_SIFIVE_X280
+ id = BLIS_ARCH_SIFIVE_X280;
+ #endif
+
// Generic microarchitecture.
#ifdef BLIS_FAMILY_GENERIC
id = BLIS_ARCH_GENERIC;
@@ -351,6 +356,8 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
"rv32iv",
"rv64iv",
+ "sifive_x280",
+
"generic"
};
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h
index d91c0542d8..7f1db27066 100644
--- a/frame/base/bli_blksz.h
+++ b/frame/base/bli_blksz.h
@@ -84,14 +84,15 @@ BLIS_INLINE void bli_blksz_copy
*b_dst = *b_src;
}
-BLIS_INLINE void bli_blksz_copy_if_pos
+BLIS_INLINE void bli_blksz_copy_if_nonneg
(
const blksz_t* b_src,
blksz_t* b_dst
)
{
- // Copy the blocksize values over to b_dst one-by-one so that
- // we can skip the ones that are non-positive.
+ // Copy the blocksize values over to b_dst one-by-one. Note that we
+ // only copy values that are zero or positive (and skip copying any
+ // values that are negative).
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src );
const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src );
@@ -103,15 +104,15 @@ BLIS_INLINE void bli_blksz_copy_if_pos
const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src );
const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src );
- if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst );
- if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst );
- if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
- if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
+ if ( v_s >= 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst );
+ if ( v_d >= 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst );
+ if ( v_c >= 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
+ if ( v_z >= 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
- if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst );
- if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst );
- if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
- if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
+ if ( e_s >= 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst );
+ if ( e_d >= 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst );
+ if ( e_c >= 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
+ if ( e_z >= 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
}
BLIS_INLINE void bli_blksz_copy_def_dt
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index daa092ba72..bd688f85ad 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -37,7 +37,7 @@
cntl_t* bli_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
opid_t family,
bszid_t bszid,
void_fp var_func,
@@ -52,7 +52,7 @@ cntl_t* bli_cntl_create_node
#endif
// Allocate the cntl_t struct.
- cntl = bli_sba_acquire( pool, sizeof( cntl_t ) );
+ cntl = bli_sba_acquire( sba_pool, sizeof( cntl_t ) );
bli_cntl_set_family( family, cntl );
bli_cntl_set_bszid( bszid, cntl );
@@ -66,7 +66,7 @@ cntl_t* bli_cntl_create_node
void bli_cntl_free_node
(
- pool_t* pool,
+ pool_t* sba_pool,
cntl_t* cntl
)
{
@@ -74,7 +74,7 @@ void bli_cntl_free_node
printf( "bli_cntl_free_node(): " );
#endif
- bli_sba_release( pool, cntl );
+ bli_sba_release( sba_pool, cntl );
}
void bli_cntl_clear_node
@@ -94,7 +94,7 @@ void bli_cntl_clear_node
void bli_cntl_free
(
- pool_t* pool,
+ pool_t* sba_pool,
cntl_t* cntl
)
{
@@ -110,7 +110,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-prenode and its
// children.
- bli_cntl_free( pool, cntl_sub_prenode );
+ bli_cntl_free( sba_pool, cntl_sub_prenode );
}
// Only recurse into the child node if it exists.
@@ -118,7 +118,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-node and its
// children.
- bli_cntl_free( pool, cntl_sub_node );
+ bli_cntl_free( sba_pool, cntl_sub_node );
}
// Free the current node's params field, if it is non-NULL.
@@ -128,18 +128,18 @@ void bli_cntl_free
printf( "bli_cntl_free_w_thrinfo(): " );
#endif
- bli_sba_release( pool, cntl_params );
+ bli_sba_release( sba_pool, cntl_params );
}
// Free the current node.
- bli_cntl_free_node( pool, cntl );
+ bli_cntl_free_node( sba_pool, cntl );
}
// -----------------------------------------------------------------------------
cntl_t* bli_cntl_copy
(
- pool_t* pool,
+ pool_t* sba_pool,
const cntl_t* cntl
)
{
@@ -149,7 +149,7 @@ cntl_t* bli_cntl_copy
// field.
cntl_t* cntl_copy = bli_cntl_create_node
(
- pool,
+ sba_pool,
bli_cntl_family( cntl ),
bli_cntl_bszid( cntl ),
bli_cntl_var_func( cntl ),
@@ -165,7 +165,7 @@ cntl_t* bli_cntl_copy
// struct.
uint64_t params_size = bli_cntl_params_size( cntl );
void* params_orig = bli_cntl_params( cntl );
- void* params_copy = bli_sba_acquire( pool, ( size_t )params_size );
+ void* params_copy = bli_sba_acquire( sba_pool, ( size_t )params_size );
// Copy the original params struct to the new memory region.
memcpy( params_copy, params_orig, params_size );
@@ -180,7 +180,7 @@ cntl_t* bli_cntl_copy
{
cntl_t* sub_prenode_copy = bli_cntl_copy
(
- pool,
+ sba_pool,
bli_cntl_sub_prenode( cntl )
);
@@ -194,7 +194,7 @@ cntl_t* bli_cntl_copy
{
cntl_t* sub_node_copy = bli_cntl_copy
(
- pool,
+ sba_pool,
bli_cntl_sub_node( cntl )
);
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 8c6cafc13c..4635c11f4a 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -100,7 +100,7 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
//cntx_blkszs[ bs_id ] = *blksz;
//bli_blksz_copy( blksz, cntx_blksz );
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
- bli_blksz_copy_if_pos( blksz, cntx_blksz );
+ bli_blksz_copy_if_nonneg( blksz, cntx_blksz );
// Copy the blocksize multiple id into the context.
cntx_bmults[ bs_id ] = bm_id;
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 7b9ab3d7c2..a21aa12446 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -259,6 +259,14 @@ int bli_gks_init( void )
bli_cntx_init_rv64iv_ind );
#endif
+ // -- SiFive architectures ----------------------------------------------
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+ bli_gks_register_cntx( BLIS_ARCH_SIFIVE_X280, bli_cntx_init_sifive_x280,
+ bli_cntx_init_sifive_x280_ref,
+ bli_cntx_init_sifive_x280_ind );
+#endif
+
// -- Generic architectures --------------------------------------------
#ifdef BLIS_CONFIG_GENERIC
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 5123c5b4b2..54da4c7d91 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -47,17 +47,21 @@ apool_t* bli_sba_query( void )
void bli_sba_init( void )
{
+#ifdef BLIS_ENABLE_SBA_POOLS
bli_apool_init( &sba );
+#endif
}
void bli_sba_finalize( void )
{
+#ifdef BLIS_ENABLE_SBA_POOLS
bli_apool_finalize( &sba );
+#endif
}
void* bli_sba_acquire
(
- pool_t* pool,
+ pool_t* sba_pool,
siz_t req_size
)
{
@@ -74,7 +78,7 @@ void* bli_sba_acquire
// is convenient to not have to checkout an array_t from the sba, and it
// does no harm since the malloc() happens outside of the region that
// would be timed.)
- if ( pool == NULL )
+ if ( sba_pool == NULL )
{
block = bli_malloc_intl( req_size, &r_val );
}
@@ -84,10 +88,10 @@ void* bli_sba_acquire
// Query the block_size of the pool_t so that we can request the exact
// size present.
- const siz_t block_size = bli_pool_block_size( pool );
+ const siz_t block_size = bli_pool_block_size( sba_pool );
// Sanity check: Make sure the requested size is no larger than the
- // block_size field of the pool.
+ // block_size field of the sba pool.
if ( block_size < req_size )
{
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
@@ -96,7 +100,7 @@ void* bli_sba_acquire
}
// Check out a block using the block_size queried above.
- bli_pool_checkout_block( block_size, &pblk, pool );
+ bli_pool_checkout_block( block_size, &pblk, sba_pool );
// The block address is stored within the pblk_t.
block = bli_pblk_buf( &pblk );
@@ -114,13 +118,13 @@ void* bli_sba_acquire
void bli_sba_release
(
- pool_t* pool,
+ pool_t* sba_pool,
void* block
)
{
#ifdef BLIS_ENABLE_SBA_POOLS
- if ( pool == NULL )
+ if ( sba_pool == NULL )
{
bli_free_intl( block );
}
@@ -132,17 +136,17 @@ void bli_sba_release
// for this particular application of the pool_t (that is, the "leaf"
// component of the sba), but it seems like good housekeeping to maintain
// the block_size field of the pblk_t in case its ever needed/read.
- const siz_t block_size = bli_pool_block_size( pool );
+ const siz_t block_size = bli_pool_block_size( sba_pool );
// Embed the block's memory address into a pblk_t, along with the
- // block_size queried from the pool.
+ // block_size queried from the sba pool.
bli_pblk_set_buf( block, &pblk );
bli_pblk_set_block_size( block_size, &pblk );
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
// a local variable since its contents are copied into the pool's internal
// data structure--an array of pblk_t.)
- bli_pool_checkin_block( &pblk, pool );
+ bli_pool_checkin_block( &pblk, sba_pool );
}
#else
@@ -176,3 +180,17 @@ void bli_sba_checkin_array
#endif
}
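+// Return the sba pool_t* corresponding to the given thread index within
+// the array_t. When sba pools are disabled (or the array is NULL), return
+// NULL so that bli_sba_acquire()/bli_sba_release() fall back to
+// bli_malloc_intl()/bli_free_intl().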
+pool_t* bli_sba_array_elem
+ (
+ siz_t index,
+ array_t* array
+ )
+{
+#ifdef BLIS_ENABLE_SBA_POOLS
+ if ( array != NULL ) return bli_apool_array_elem( index, array );
+ else return NULL;
+#else
+ return NULL;
+#endif
+}
+
diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h
index 8d9db844f9..92e53e7b30 100644
--- a/frame/base/bli_sba.h
+++ b/frame/base/bli_sba.h
@@ -42,6 +42,18 @@ apool_t* bli_sba_query( void );
void bli_sba_init( void );
void bli_sba_finalize( void );
+void* bli_sba_acquire
+ (
+ pool_t* sba_pool,
+ siz_t req_size
+ );
+
+void bli_sba_release
+ (
+ pool_t* sba_pool,
+ void* block
+ );
+
array_t* bli_sba_checkout_array
(
siz_t n_threads
@@ -52,16 +64,10 @@ void bli_sba_checkin_array
array_t* array
);
-void* bli_sba_acquire
+pool_t* bli_sba_array_elem
(
- pool_t* pool,
- siz_t req_size
- );
-
-void bli_sba_release
- (
- pool_t* pool,
- void* block
+ siz_t index,
+ array_t* array
);
#endif
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index c5b5ebda37..8923acdc48 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -38,8 +38,8 @@
//
// Define BLAS-to-BLIS interfaces.
//
-#undef GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
@@ -110,6 +110,6 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( symv, symv )
+INSERT_GENTFUNC_BLAS( symv, symv )
#endif
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 2f493a9d97..4f453a7a32 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -37,8 +37,8 @@
//
// Prototype BLAS-to-BLIS interfaces.
//
-#undef GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
@@ -52,7 +52,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
);
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( symv )
+INSERT_GENTPROT_BLAS( symv )
#endif
#endif
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 6732a75cf2..91dc99b599 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -38,8 +38,8 @@
//
// Define BLAS-to-BLIS interfaces.
//
-#undef GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
@@ -101,6 +101,6 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( syr, syr )
+INSERT_GENTFUNC_BLAS( syr, syr )
#endif
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 662d07328f..7f3eeb3679 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -37,8 +37,8 @@
//
// Prototype BLAS-to-BLIS interfaces.
//
-#undef GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
@@ -50,7 +50,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
);
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( syr )
+INSERT_GENTPROT_BLAS( syr )
#endif
#endif
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index c79769bc05..0dbd720d21 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
+ Copyright (C) 2023, Field G. Van Zee
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -358,5 +359,484 @@
return 0;
} /* zdrot_ */
+
+/* crot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine */ int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s)
+{
+ /* System generated locals */
+ bla_integer i__1, i__2, i__3, i__4;
+ bla_scomplex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ bla_integer i__, ix, iy;
+ bla_scomplex stemp;
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
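+/* The disabled block below preserves the raw f2c-generated updates for
+   reference; the #else branch performs the same arithmetic via the
+   bli_csets()/bli_creal()/bli_cimag() accessor macros. */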
+#if 0
+ q__2.r = *c__ * cx[i__2].r;
+ q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r;
+ q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r;
+ stemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r;
+ q__2.i = *c__ * cy[i__3].i;
+ bla_r_cnjg(&q__4, s);
+ i__4 = ix;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i;
+ q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r;
+ q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r;
+ cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_csets
+ (
+ *c__ * bli_creal(cx[i__2]),
+ *c__ * bli_cimag(cx[i__2]),
+ q__2
+ );
+ i__3 = iy;
+ bli_csets
+ (
+ bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+ bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) + bli_creal(q__3),
+ bli_cimag(q__2) + bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ stemp
+ );
+ i__2 = iy;
+ i__3 = iy;
+ bli_csets
+ (
+ *c__ * bli_creal(cy[i__3]),
+ *c__ * bli_cimag(cy[i__3]),
+ q__2
+ );
+ bla_r_cnjg(&q__4, s);
+ i__4 = ix;
+ bli_csets
+ (
+ bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+ bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) - bli_creal(q__3),
+ bli_cimag(q__2) - bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ cy[i__2]
+ );
+ i__2 = ix;
+ bli_csets
+ (
+ bli_creal(stemp),
+ bli_cimag(stemp),
+ cx[i__2]
+ );
+#endif
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+#if 0
+ q__2.r = *c__ * cx[i__2].r;
+ q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r;
+ q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r;
+ stemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r;
+ q__2.i = *c__ * cy[i__3].i;
+ bla_r_cnjg(&q__4, s);
+ i__4 = i__;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i;
+ q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r;
+ q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r;
+ cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_csets
+ (
+ *c__ * bli_creal(cx[i__2]),
+ *c__ * bli_cimag(cx[i__2]),
+ q__2
+ );
+ i__3 = i__;
+ bli_csets
+ (
+ bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+ bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) + bli_creal(q__3),
+ bli_cimag(q__2) + bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ stemp
+ );
+ i__2 = i__;
+ i__3 = i__;
+ bli_csets
+ (
+ *c__ * bli_creal(cy[i__3]),
+ *c__ * bli_cimag(cy[i__3]),
+ q__2
+ );
+ bla_r_cnjg(&q__4, s);
+ i__4 = i__;
+ bli_csets
+ (
+ bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+ bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) - bli_creal(q__3),
+ bli_cimag(q__2) - bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ cy[i__2]
+ );
+ i__2 = i__;
+ bli_csets
+ (
+ bli_creal(stemp),
+ bli_cimag(stemp),
+ cx[i__2]
+ );
+#endif
+/* L30: */
+ }
+ return 0;
+} /* crot_ */
+
+
+/* zrot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine */ int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s)
+{
+ /* System generated locals */
+ bla_integer i__1, i__2, i__3, i__4;
+ bla_dcomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ bla_integer i__, ix, iy;
+ bla_dcomplex stemp;
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+#if 0
+ z__2.r = *c__ * cx[i__2].r;
+ z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r;
+ z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r;
+ stemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r;
+ z__2.i = *c__ * cy[i__3].i;
+ bla_d_cnjg(&z__4, s);
+ i__4 = ix;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+ z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r;
+ z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r;
+ cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_zsets
+ (
+ *c__ * bli_zreal(cx[i__2]),
+ *c__ * bli_zimag(cx[i__2]),
+ z__2
+ );
+ i__3 = iy;
+ bli_zsets
+ (
+ bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+ bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) + bli_zreal(z__3),
+ bli_zimag(z__2) + bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ stemp
+ );
+ i__2 = iy;
+ i__3 = iy;
+ bli_zsets
+ (
+ *c__ * bli_zreal(cy[i__3]),
+ *c__ * bli_zimag(cy[i__3]),
+ z__2
+ );
+ bla_d_cnjg(&z__4, s);
+ i__4 = ix;
+ bli_zsets
+ (
+ bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+ bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) - bli_zreal(z__3),
+ bli_zimag(z__2) - bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ cy[i__2]
+ );
+ i__2 = ix;
+ bli_zsets
+ (
+ bli_zreal(stemp),
+ bli_zimag(stemp),
+ cx[i__2]
+ );
+#endif
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+#if 0
+ z__2.r = *c__ * cx[i__2].r;
+ z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r;
+ z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r;
+ stemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r;
+ z__2.i = *c__ * cy[i__3].i;
+ bla_d_cnjg(&z__4, s);
+ i__4 = i__;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+ z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r;
+ z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r;
+ cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_zsets
+ (
+ *c__ * bli_zreal(cx[i__2]),
+ *c__ * bli_zimag(cx[i__2]),
+ z__2
+ );
+ i__3 = i__;
+ bli_zsets
+ (
+ bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+ bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) + bli_zreal(z__3),
+ bli_zimag(z__2) + bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ stemp
+ );
+ i__2 = i__;
+ i__3 = i__;
+ bli_zsets
+ (
+ *c__ * bli_zreal(cy[i__3]),
+ *c__ * bli_zimag(cy[i__3]),
+ z__2
+ );
+ bla_d_cnjg(&z__4, s);
+ i__4 = i__;
+ bli_zsets
+ (
+ bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+ bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) - bli_zreal(z__3),
+ bli_zimag(z__2) - bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ cy[i__2]
+ );
+ i__2 = i__;
+ bli_zsets
+ (
+ bli_zreal(stemp),
+ bli_zimag(stemp),
+ cx[i__2]
+ );
+#endif
+/* L30: */
+ }
+ return 0;
+} /* zrot_ */
+
+
#endif
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index ca4a4f9ac1..4e6aead4a8 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -38,5 +38,7 @@ BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const b
BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
+BLIS_EXPORT_BLAS int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s);
+BLIS_EXPORT_BLAS int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s);
#endif
diff --git a/frame/compat/f2c/other/crot.c b/frame/compat/f2c/other/crot.c
new file mode 100644
index 0000000000..e3e1282f4e
--- /dev/null
+++ b/frame/compat/f2c/other/crot.c
@@ -0,0 +1,227 @@
+/* crot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*/
+
+/* =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/* http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download CROT + dependencies */
+/* > */
+/* > [TGZ] */
+/* > */
+/* > [ZIP] */
+/* > */
+/* > [TXT] */
+/* > \endhtmlonly */
+
+/* Definition: */
+/* =========== */
+
+/* SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/* .. Scalar Arguments .. */
+/* INTEGER INCX, INCY, N */
+/* REAL C */
+/* COMPLEX S */
+/* .. */
+/* .. Array Arguments .. */
+/* COMPLEX CX( * ), CY( * ) */
+/* .. */
+
+
+/* > \par Purpose: */
+/* ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > CROT applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/* Arguments: */
+/* ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* > N is INTEGER */
+/* > The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* > CX is COMPLEX array, dimension (N) */
+/* > On input, the vector X. */
+/* > On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* > INCX is INTEGER */
+/* > The increment between successive values of CX. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* > CY is COMPLEX array, dimension (N) */
+/* > On input, the vector Y. */
+/* > On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* > INCY is INTEGER */
+/* > The increment between successive values of CY. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* > C is REAL */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* > S is COMPLEX */
+/* > C and S define a rotation */
+/* > [ C S ] */
+/* > [ -conjg(S) C ] */
+/* > where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/* Authors: */
+/* ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complexOTHERauxiliary */
+
+/* ===================================================================== */
+/* Subroutine */ int crot_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy, real *c__, complex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Builtin functions */
+ void r_cnjg(complex *, complex *);
+
+ /* Local variables */
+ integer i__, ix, iy;
+ complex stemp;
+
+
+/* -- LAPACK auxiliary routine -- */
+/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
+/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/* .. Scalar Arguments .. */
+/* .. */
+/* .. Array Arguments .. */
+/* .. */
+
+/* ===================================================================== */
+
+/* .. Local Scalars .. */
+/* .. */
+/* .. Intrinsic Functions .. */
+/* .. */
+/* .. Executable Statements .. */
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = ix;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = i__;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* crot_ */
+
diff --git a/frame/compat/f2c/other/crot.f b/frame/compat/f2c/other/crot.f
new file mode 100644
index 0000000000..6dc771506f
--- /dev/null
+++ b/frame/compat/f2c/other/crot.f
@@ -0,0 +1,159 @@
+*> \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CROT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*
+* .. Scalar Arguments ..
+* INTEGER INCX, INCY, N
+* REAL C
+* COMPLEX S
+* ..
+* .. Array Arguments ..
+* COMPLEX CX( * ), CY( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CROT applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*> CX is COMPLEX array, dimension (N)
+*> On input, the vector X.
+*> On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> The increment between successive values of CX. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*> CY is COMPLEX array, dimension (N)
+*> On input, the vector Y.
+*> On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> The increment between successive values of CY. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*> C is REAL
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*> S is COMPLEX
+*> C and S define a rotation
+*> [ C S ]
+*> [ -conjg(S) C ]
+*> where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+* =====================================================================
+ SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*
+* -- LAPACK auxiliary routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER INCX, INCY, N
+ REAL C
+ COMPLEX S
+* ..
+* .. Array Arguments ..
+ COMPLEX CX( * ), CY( * )
+* ..
+*
+* =====================================================================
+*
+* .. Local Scalars ..
+ INTEGER I, IX, IY
+ COMPLEX STEMP
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG
+* ..
+* .. Executable Statements ..
+*
+ IF( N.LE.0 )
+ $ RETURN
+ IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+ $ GO TO 20
+*
+* Code for unequal increments or equal increments not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF( INCX.LT.0 )
+ $ IX = ( -N+1 )*INCX + 1
+ IF( INCY.LT.0 )
+ $ IY = ( -N+1 )*INCY + 1
+ DO 10 I = 1, N
+ STEMP = C*CX( IX ) + S*CY( IY )
+ CY( IY ) = C*CY( IY ) - CONJG( S )*CX( IX )
+ CX( IX ) = STEMP
+ IX = IX + INCX
+ IY = IY + INCY
+ 10 CONTINUE
+ RETURN
+*
+* Code for both increments equal to 1
+*
+ 20 CONTINUE
+ DO 30 I = 1, N
+ STEMP = C*CX( I ) + S*CY( I )
+ CY( I ) = C*CY( I ) - CONJG( S )*CX( I )
+ CX( I ) = STEMP
+ 30 CONTINUE
+ RETURN
+ END
diff --git a/frame/compat/f2c/other/zrot.c b/frame/compat/f2c/other/zrot.c
new file mode 100644
index 0000000000..0706f8b251
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.c
@@ -0,0 +1,227 @@
+/* zrot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*/
+
+/* =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/* http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download ZROT + dependencies */
+/* > */
+/* > [TGZ] */
+/* > */
+/* > [ZIP] */
+/* > */
+/* > [TXT] */
+/* > \endhtmlonly */
+
+/* Definition: */
+/* =========== */
+
+/* SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/* .. Scalar Arguments .. */
+/* INTEGER INCX, INCY, N */
+/* DOUBLE PRECISION C */
+/* COMPLEX*16 S */
+/* .. */
+/* .. Array Arguments .. */
+/* COMPLEX*16 CX( * ), CY( * ) */
+/* .. */
+
+
+/* > \par Purpose: */
+/* ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > ZROT applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/* Arguments: */
+/* ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* > N is INTEGER */
+/* > The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* > CX is COMPLEX*16 array, dimension (N) */
+/* > On input, the vector X. */
+/* > On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* > INCX is INTEGER */
+/* > The increment between successive values of CX. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* > CY is COMPLEX*16 array, dimension (N) */
+/* > On input, the vector Y. */
+/* > On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* > INCY is INTEGER */
+/* > The increment between successive values of CY. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* > C is DOUBLE PRECISION */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* > S is COMPLEX*16 */
+/* > C and S define a rotation */
+/* > [ C S ] */
+/* > [ -conjg(S) C ] */
+/* > where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/* Authors: */
+/* ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16OTHERauxiliary */
+
+/* ===================================================================== */
+/* Subroutine */ int zrot_(integer *n, doublecomplex *cx, integer *incx,
+ doublecomplex *cy, integer *incy, doublereal *c__, doublecomplex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Builtin functions */
+ void d_cnjg(doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ integer i__, ix, iy;
+ doublecomplex stemp;
+
+
+/* -- LAPACK auxiliary routine -- */
+/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
+/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/* .. Scalar Arguments .. */
+/* .. */
+/* .. Array Arguments .. */
+/* .. */
+
+/* ===================================================================== */
+
+/* .. Local Scalars .. */
+/* .. */
+/* .. Intrinsic Functions .. */
+/* .. */
+/* .. Executable Statements .. */
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = ix;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = i__;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* zrot_ */
+
diff --git a/frame/compat/f2c/other/zrot.f b/frame/compat/f2c/other/zrot.f
new file mode 100644
index 0000000000..28fc8ec1de
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.f
@@ -0,0 +1,159 @@
+*> \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZROT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*
+* .. Scalar Arguments ..
+* INTEGER INCX, INCY, N
+* DOUBLE PRECISION C
+* COMPLEX*16 S
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 CX( * ), CY( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZROT applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*> CX is COMPLEX*16 array, dimension (N)
+*> On input, the vector X.
+*> On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> The increment between successive values of CX. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*> CY is COMPLEX*16 array, dimension (N)
+*> On input, the vector Y.
+*> On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> The increment between successive values of CY. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*> C is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*> S is COMPLEX*16
+*> C and S define a rotation
+*> [ C S ]
+*> [ -conjg(S) C ]
+*> where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16OTHERauxiliary
+*
+* =====================================================================
+ SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*
+* -- LAPACK auxiliary routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER INCX, INCY, N
+ DOUBLE PRECISION C
+ COMPLEX*16 S
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 CX( * ), CY( * )
+* ..
+*
+* =====================================================================
+*
+* .. Local Scalars ..
+ INTEGER I, IX, IY
+ COMPLEX*16 STEMP
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DCONJG
+* ..
+* .. Executable Statements ..
+*
+ IF( N.LE.0 )
+ $ RETURN
+ IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+ $ GO TO 20
+*
+* Code for unequal increments or equal increments not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF( INCX.LT.0 )
+ $ IX = ( -N+1 )*INCX + 1
+ IF( INCY.LT.0 )
+ $ IY = ( -N+1 )*INCY + 1
+ DO 10 I = 1, N
+ STEMP = C*CX( IX ) + S*CY( IY )
+ CY( IY ) = C*CY( IY ) - DCONJG( S )*CX( IX )
+ CX( IX ) = STEMP
+ IX = IX + INCX
+ IY = IY + INCY
+ 10 CONTINUE
+ RETURN
+*
+* Code for both increments equal to 1
+*
+ 20 CONTINUE
+ DO 30 I = 1, N
+ STEMP = C*CX( I ) + S*CY( I )
+ CY( I ) = C*CY( I ) - DCONJG( S )*CX( I )
+ CX( I ) = STEMP
+ 30 CONTINUE
+ RETURN
+ END
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index 37c5af3984..6a5a5a569a 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -156,6 +156,11 @@ CNTX_INIT_PROTS( rv32iv )
CNTX_INIT_PROTS( rv64iv )
#endif
+// -- SiFive architectures --
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+CNTX_INIT_PROTS( sifive_x280 )
+#endif
// -- Generic --
@@ -296,6 +301,12 @@ CNTX_INIT_PROTS( generic )
#include "bli_family_bgq.h"
#endif
+// -- SiFive families --
+
+#ifdef BLIS_FAMILY_SIFIVE_X280
+#include "bli_family_sifive_x280.h"
+#endif
+
// -- Generic --
#ifdef BLIS_FAMILY_GENERIC
@@ -386,5 +397,12 @@ CNTX_INIT_PROTS( generic )
#include "bli_kernels_rviv.h"
#endif
+// -- SiFive RISC-V architectures --
+
+#ifdef BLIS_KERNELS_SIFIVE_X280
+#include "bli_kernels_sifive_x280.h"
+#endif
+
+
#endif
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 62ccd0c41a..9771ec5791 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -975,6 +975,9 @@ typedef enum
BLIS_ARCH_RV32IV,
BLIS_ARCH_RV64IV,
+ // SiFive
+ BLIS_ARCH_SIFIVE_X280,
+
// Generic architecture/configuration
BLIS_ARCH_GENERIC,
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 96e21b99f8..56f5aba9a9 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -67,6 +67,17 @@ typedef struct barrier_s barrier_t;
#endif
#endif
+// Define hpx_barrier_t, which is specific to the barrier used in the HPX
+// implementation. This needs to be defined first since it is (potentially)
+// used within the definition of thrcomm_t below.
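+// The handle member stores a type-erased pointer to the C++ hpx::barrier<>
+// object, which is created and destroyed in bli_thrcomm_hpx.cpp.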
+
+#ifdef BLIS_ENABLE_HPX
+typedef struct hpx_barrier_t
+{
+ void* handle;
+} hpx_barrier_t;
+#endif
+
// Define the thrcomm_t structure, which will be common to all threading
// implementations.
@@ -124,9 +135,7 @@ typedef struct thrcomm_s
// -- Fields specific to HPX --
#ifdef BLIS_ENABLE_HPX
- #ifdef BLIS_USE_HPX_BARRIER
- hpx::barrier<> * barrier;
- #endif
+ hpx_barrier_t barrier;
#endif
} thrcomm_t;
diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp
index 323871ef80..0947dc81df 100644
--- a/frame/thread/bli_thrcomm_hpx.cpp
+++ b/frame/thread/bli_thrcomm_hpx.cpp
@@ -36,43 +36,36 @@
#ifdef BLIS_ENABLE_HPX
+#include <hpx/barrier.hpp>
extern "C" {
-#ifdef BLIS_USE_HPX_BARRIER
-
// Define the hpx_barrier_t implementations of the init, cleanup, and
// barrier functions.
-void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
+void hpx_barrier_init( hpx_barrier_t* barrier, dim_t n_threads )
{
- if ( comm == nullptr ) return;
-
- //comm->sent_object = nullptr;
- //comm->n_threads = n_threads;
- comm->ti = BLIS_HPX;
- //comm->barrier_sense = 0;
- //comm->barrier_threads_arrived = 0;
-
- comm->barrier = new hpx:barrier<>();
+ if ( barrier == nullptr ) return;
+ barrier->handle = new hpx::barrier<>( n_threads );
}
-void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
+void hpx_barrier_destroy( hpx_barrier_t* barrier )
{
- if ( comm == nullptr ) return;
+ if ( barrier == nullptr ) return;
- delete comm->barrier;
-}
+ auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
+ barrier->handle = nullptr;
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
-{
- comm->barrier->arrive_and_wait();
+ delete barrier_;
}
-#else
+void hpx_barrier_arrive_and_wait( hpx_barrier_t* barrier )
+{
+ if ( barrier == nullptr ) return;
+ auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
-// Define the non-hpx::barrier implementations of the init, cleanup,
-// and barrier functions. These are the default unless the hpx::barrier
-// versions are requested at compile-time.
+ if ( barrier_ == nullptr ) return;
+ barrier_->arrive_and_wait();
+}
void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
{
@@ -81,22 +74,24 @@ void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
comm->sent_object = nullptr;
comm->n_threads = n_threads;
comm->ti = BLIS_HPX;
- comm->barrier_sense = 0;
- comm->barrier_threads_arrived = 0;
+ // comm->barrier_sense = 0;
+ // comm->barrier_threads_arrived = 0;
+
+ hpx_barrier_init( &comm->barrier, n_threads );
}
void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
{
+ if ( comm == nullptr ) return;
+ hpx_barrier_destroy( &comm->barrier );
}
void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm )
{
- bli_thrcomm_barrier_atomic( t_id, comm );
+ hpx_barrier_arrive_and_wait( &comm->barrier );
}
-} // extern "C"
-
-#endif
+}
#endif
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index f69a0f5d7e..baf2eb3f2d 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -36,9 +36,10 @@
#ifdef BLIS_ENABLE_HPX
-#include
-#include
+#include
#include
+#include
+#include
extern "C"
{
@@ -56,12 +57,21 @@ void bli_thread_launch_hpx
pool_t* gl_comm_pool = nullptr;
thrcomm_t* gl_comm = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
- auto irange = hpx::util::counting_shape(n_threads);
-
- hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange),
- [&gl_comm, &func, &params](const dim_t tid)
+ // Execute func on the HPX runtime: one hpx::async task is spawned per
+ // thread id, and hpx::wait_all joins them before returning.
+ hpx::threads::run_as_hpx_thread([&]()
{
- func( gl_comm, tid, params );
+ std::vector<hpx::future<void>> futures;
+ futures.reserve(n_threads);
+
+ for (dim_t tid = 0; tid < n_threads; ++tid)
+ {
+ futures.push_back(hpx::async([tid, &gl_comm, &func, &params]()
+ {
+ func( gl_comm, tid, params );
+ }));
+ }
+
+ hpx::wait_all(futures);
});
// Free the global communicator, because the root thrinfo_t node
@@ -76,7 +86,7 @@ void bli_thread_initialize_hpx( int argc, char** argv )
int bli_thread_finalize_hpx()
{
- hpx::apply([]() { hpx::finalize(); });
+ hpx::post([]() { hpx::finalize(); });
return hpx::stop();
}
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
new file mode 100644
index 0000000000..2b7ad6fe7d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define ADDV(...) ADDV_(__VA_ARGS__)
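+// Each #include below instantiates the kernel defined above for one datatype;
+// the DATATYPE, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE macros select the
+// element type and the RVV element width and register grouping used by that
+// instantiation.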
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef ADDV
+#undef ADDV_
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..d5343befe0
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+ // Computes y := y + conjx(x)
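+ // Complex elements are loaded with (strided) segment loads, which
+ // de-interleave the real and imaginary parts into separate vector registers;
+ // conjugation then only changes the sign used when accumulating the
+ // imaginary part.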
+ (void) cntx;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFADD_VV(PREC, LMUL)(yvec_real, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+ else
+ yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..d4e7d4a45e
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
@@ -0,0 +1,78 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+ // Computes y = y + conjx(x)
+ // == y + x (real case)
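+ // Unit-stride operands use unit-stride vector loads/stores; non-unit
+ // strides use the strided forms with a byte stride of FLT_SIZE * inc.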
+
+ (void) cntx;
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFADD_VV(PREC, LMUL)(yvec, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
new file mode 100644
index 0000000000..c423dd131d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -0,0 +1,293 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include
+#include
+#include
+#include
+
+void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const float* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 4;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
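+ // v8 holds the running elementwise maximum of |x| and v16 the corresponding
+ // element indices (as 64-bit values). A NaN encountered in x causes an
+ // immediate return with its index. After the loop, vfredmax finds the
+ // maximum value and a masked vredminu selects the smallest index at which
+ // it occurs.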
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 4)
+ __asm__("vle32.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vfabs.v v8, v24");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vfabs.v v24, v24");
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const double* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 8;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 8)
+ __asm__("vle64.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vfabs.v v8, v24");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vfabs.v v24, v24");
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const scomplex* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 8;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 8)
+ __asm__("vlseg2e32.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfabs.v v24, v24");
+ __asm__("vfabs.v v28, v28");
+ __asm__("vfadd.vv v24, v24, v28");
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vmv4r.v v8, v24");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const dcomplex* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 16;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 16)
+ __asm__("vlseg2e64.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfabs.v v24, v24");
+ __asm__("vfabs.v v28, v28");
+ __asm__("vfadd.vv v24, v24, v28");
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vmv4r.v v8, v24");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..3b29f898df
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
@@ -0,0 +1,129 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict beta_, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define AXPBYV(...) AXPBYV_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
+#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr
+#define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR)
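+// These macros name other SiFive x280 kernels; the axpbyv kernels below
+// dispatch to setv, scalv, or scal2v when alpha and/or beta is zero.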
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPBYV
+#undef AXPBYV_
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..31fc584b97
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,121 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+ // Computes y := beta * y + alpha * conjx(x)
+
+ if (n <= 0) return;
+
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (alpha->real == 0 && alpha->imag == 0 && beta->real == 0 && beta->imag == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+ if (alpha->real == 0 && alpha->imag == 0){
+ SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+ return;
+ }
+ if (beta->real == 0 && beta->imag == 0){
+ SCAL2V(PRECISION_CHAR)(conjx, n, alpha, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we
+ // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+ // TO DO (optimization): special cases for alpha = +-1, +-i, beta = +-1, +-i
+
+ // alpha and beta are both nonzero
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, temp_real, temp_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ // Computed as:
+ //   y.real = beta.real * y.real - beta.imag * y.imag + alpha.real * x.real - alpha.imag * xi
+ //   y.imag = beta.real * y.imag + beta.imag * y.real + alpha.imag * x.real + alpha.real * xi
+ // where xi is x.imag when conjx == BLIS_NO_CONJUGATE and -x.imag otherwise.
+ temp_real = VFMUL_VF(PREC, LMUL) (yvec_real, beta->real, vl);
+ temp_imag = VFMUL_VF(PREC, LMUL) (yvec_imag, beta->real, vl);
+ temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, beta->imag, yvec_imag, vl);
+ temp_imag = VFMACC_VF(PREC, LMUL) (temp_imag, beta->imag, yvec_real, vl);
+ yvec_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->real, xvec_real, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (temp_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE) {
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..33eafc5d12
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,98 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+ // Computes y := beta * y + alpha * conjx(x)
+ // == beta * y + alpha * x (real case)
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
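+    // Degenerate scaling factors reduce to simpler kernels: alpha == 0 and beta == 0 writes
+    // zeros (setv with alpha), alpha == 0 alone is y := beta * y (scalv), and beta == 0
+    // alone is y := alpha * x (scal2v).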
+ if (*alpha == 0 && *beta == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+ if (*alpha == 0){
+ SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+ return;
+ }
+ if (*beta == 0){
+ SCAL2V(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we
+ // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+ // TO DO (optimization): special cases for alpha = +-1, beta = +-1
+
+ // alpha and beta are both nonzero
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMUL_VF(PREC, LMUL) (yvec, *beta, vl);
+ yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL)(y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..3f9ebd3b04
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
@@ -0,0 +1,119 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define AXPYV(...) AXPYV_(__VA_ARGS__)
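+
+// The per-type kernels are generated by including the _real.c / _complex.c bodies once per
+// datatype below; DATATYPE, BASE_DT, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE parameterize
+// each instantiation, and the AXPYV macro above expands to the matching function signature.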
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPYV
+#undef AXPYV_
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..dc520d2125
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,94 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+ // Computes y := y + alpha * conjx(x)
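+    // Per element, with s = +1 for BLIS_NO_CONJUGATE and s = -1 for BLIS_CONJUGATE:
+    //   y.real += alpha.real * x.real - s * alpha.imag * x.imag
+    //   y.imag += alpha.imag * x.real + s * alpha.real * x.imag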
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (alpha->real == 0 && alpha->imag == 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFMACC_VF(PREC, LMUL)( yvec_real, alpha->real, xvec_real, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL)( yvec_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..0c2cda842f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
@@ -0,0 +1,79 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+    // Computes y := y + alpha * conjx(x)
+    //        == y + alpha * x (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (*alpha == 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
new file mode 100644
index 0000000000..3571877759
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -0,0 +1,272 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)conjx;
+ (void)cntx;
+ const float* restrict x = x_;
+ float* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)conjx;
+ (void)cntx;
+ const double* restrict x = x_;
+ double* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict x = x_;
+ scomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
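+    // With no conjugation each scomplex (two 32-bit floats) is copied as a single 64-bit
+    // element, so that branch uses vle64/vse64 even though FLT_SIZE is 4; the conjugating
+    // branch loads the real/imaginary planes separately and negates the imaginary one.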
+ if (conjx == BLIS_NO_CONJUGATE) {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * 2 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ } else {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ __asm__("vfneg.v v4, v4");
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define SH_ADD "sh3add "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict x = x_;
+ dcomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE &&
+ incy == 2 * FLT_SIZE) {
+ size_t avl = 2 * n;
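+        // Unit strides and no conjugation: treat the n dcomplex elements as 2*n contiguous
+        // doubles and copy them with plain vle64/vse64; sh3add advances each pointer by
+        // vl * 8 bytes.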
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl));
+ __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl));
+ avl -= vl;
+ }
+ } else {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (conjx == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
new file mode 100644
index 0000000000..0dc8565400
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict rho_, \
+ const cntx_t* cntx \
+)
+
+#define DOTV(...) DOTV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTV
+#undef DOTV_
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..250fab46e6
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
@@ -0,0 +1,116 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+ // Computes rho = conjxt(x)^T * conjy(y)
+ (void) cntx;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (n <= 0) {
+ rho->real = 0;
+ rho->imag = 0;
+ return;
+ }
+
+ // Instead of conjugating x, switch conjugation on y
+ // and conjugate rho at the end
+ conj_t conjrho = conjxt;
+ if (conjxt == BLIS_CONJUGATE)
+ bli_toggle_conj(&conjy); // Switch conjugation of y
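+    // This relies on the identity conj(x)^T * y == conj(x^T * conj(y)), which avoids
+    // negating the imaginary lanes of x inside the loop.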
+
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ if (first) {
+ acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl);
+ acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl);
+ first = false;
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl);
+ }
+ if (conjy == BLIS_NO_CONJUGATE) {
+ acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl);
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+ if (conjrho == BLIS_CONJUGATE) {
+ sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+ }
+ rho->real = VFMV_F_S(PREC)(sum_real);
+ rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..0ec8e6328a
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
@@ -0,0 +1,87 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+ // Computes rho = conjxt(x)^T * conjy(y)
+ // == x^T * y (real case)
+ (void) cntx;
+ (void) conjxt; // Suppress unused parameter warnings
+ (void) conjy;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (n <= 0) {
+ *rho = 0;
+ return;
+ }
+
+ RVV_TYPE_F(PREC, LMUL) acc;
+ size_t avl = n;
+ bool first = true;
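+    // Lane-wise partial sums: the first iteration seeds acc with vfmul, later iterations
+    // accumulate with tail-undisturbed vfmacc so a shorter final vl does not clobber the
+    // remaining lanes, and a single vfredusum folds all lanes at the end.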
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (first) {
+ acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl);
+ first = false;
+ } else
+ acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+ *rho = VFMV_F_S(PREC)(sum);
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
new file mode 100644
index 0000000000..048f8d2983
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
@@ -0,0 +1,130 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ const T* restrict beta_, \
+ T* restrict rho_, \
+ const cntx_t* cntx \
+)
+
+#define DOTXV(...) DOTXV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+#undef DOTXV
+#undef DOTXV_
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..8245e8e057
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
@@ -0,0 +1,130 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+ // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (beta->real == 0 && beta->imag == 0){
+ rho->real = 0;
+ rho->imag = 0;
+ } else if (!(beta->real == 1 && beta->imag == 0)) {
+ DATATYPE temp = *rho;
+ rho->real = rho->real * beta->real - rho->imag * beta->imag;
+ rho->imag = temp.real * beta->imag + rho->imag * beta->real;
+ }
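+    // The scalar update above computes rho := beta * rho; temp preserves the old rho.real
+    // so the imaginary part still sees it after rho->real has been overwritten.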
+
+ if (n <= 0 || (alpha->real == 0 && alpha->imag == 0))
+ return;
+
+ // Instead of conjugating x, switch conjugation on y
+ // and conjugate dot product at the end
+ conj_t conjsum = conjxt;
+ if (conjxt == BLIS_CONJUGATE)
+ bli_toggle_conj(&conjy); // Switch conjugation of y
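+    // As in dotv, this uses conj(x)^T * y == conj(x^T * conj(y)) so x itself never needs
+    // to be conjugated inside the loop.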
+
+ // Compute dot product
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ if (first) {
+ acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl);
+ acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl);
+ first = false;
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl);
+ }
+ if (conjy == BLIS_NO_CONJUGATE) {
+ acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl);
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+ if (conjsum == BLIS_CONJUGATE) {
+ sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+ }
+ DATATYPE dot = {VFMV_F_S(PREC)(sum_real), VFMV_F_S(PREC)(sum_imag)};
+
+ // Accumulate alpha * dot
+    rho->real = FMA( alpha->real, dot.real, rho->real);
+    rho->real = FMA(-alpha->imag, dot.imag, rho->real);
+    rho->imag = FMA( alpha->imag, dot.real, rho->imag);
+    rho->imag = FMA( alpha->real, dot.imag, rho->imag);
+
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..f9d9346973
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
@@ -0,0 +1,94 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+ // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+ // == beta * rho + alpha * x^T * y (real case)
+
+ (void) cntx;
+ (void) conjxt; // Suppress unused parameter warnings
+ (void) conjy;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (*beta == 0)
+ *rho = 0;
+ else if (*beta != 1.0f)
+ *rho *= *beta;
+
+ if (n <= 0 || *alpha == 0)
+ return;
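+    // (With alpha == 0 or n <= 0, rho already holds beta * rho, which is the final result.)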
+
+ // Compute dot product
+ RVV_TYPE_F(PREC, LMUL) acc;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (first) {
+ acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl);
+ first = false;
+ } else
+ acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+    *rho = FMA(*alpha, VFMV_F_S(PREC)(sum), *rho);
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
new file mode 100644
index 0000000000..cbca885929
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -0,0 +1,221 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ float* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ float one = 1.f;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
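+    // f0 holds 1.0f; vfrdiv.vf below computes f0 / x[i], i.e. the elementwise reciprocal.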
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ double* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ double one = 1.;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ scomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
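+    // Each element is inverted as 1 / (a + bi) = (a - bi) / (a^2 + b^2): the loop negates
+    // the imaginary lanes (v4), forms a^2 + b^2 in v8, then divides both planes by it.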
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ dcomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
new file mode 100644
index 0000000000..51edc92214
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -0,0 +1,266 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FDIV "fdiv.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ float* restrict x = x_;
+ if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
+ return;
+
+ float one = 1.f;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+ __asm__(FDIV "f0, f0, f1");
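+    // The reciprocal 1.0f / alpha is computed once in f0; the loop then scales x by it,
+    // replacing n divisions with one divide and n multiplies.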
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FDIV "fdiv.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ double* restrict x = x_;
+ if (n <= 0 || *alpha == 0. || *alpha == 1.)
+ return;
+
+ double one = 1.;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+ __asm__(FDIV "f0, f0, f1");
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FDIV "fdiv.s "
+#define FNEG "fneg.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ scomplex* restrict x = x_;
+ if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+ return;
+
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FMUL "f2, f0, f0");
+ __asm__(FMADD "f2, f1, f1, f2");
+ __asm__(FDIV "f0, f0, f2");
+ __asm__(FDIV "f1, f1, f2");
+ if (conjalpha == BLIS_NO_CONJUGATE)
+ __asm__(FNEG "f1, f1");
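+    // f0 + f1*i now holds 1/alpha (or 1/conj(alpha)); the loop applies it with a standard
+    // complex multiply: out.real = x.real*f0 - x.imag*f1, out.imag = x.imag*f0 + x.real*f1.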
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FDIV
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FDIV "fdiv.d "
+#define FNEG "fneg.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ dcomplex* restrict x = x_;
+ if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+ return;
+
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FMUL "f2, f0, f0");
+ __asm__(FMADD "f2, f1, f1, f2");
+ __asm__(FDIV "f0, f0, f2");
+ __asm__(FDIV "f1, f1, f2");
+ if (conjalpha == BLIS_NO_CONJUGATE)
+ __asm__(FNEG "f1, f1");
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
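
For reference, the complex invscalv kernels above fold the division into a single vector multiply by precomputing conj(alpha)/|alpha|^2 in f0/f1 before the strip-mined loop, flipping the sign of the imaginary part for BLIS_NO_CONJUGATE. A scalar C sketch of that arithmetic follows; the function name, the int conjalpha flag, and the use of C99 complex types are illustrative, not part of the patch:

    #include <complex.h>

    /* Scalar sketch of x[i] := x[i] / conjalpha(alpha) as implemented above:
       multiply by conj(alpha)/|alpha|^2 (no-conjugate case) or alpha/|alpha|^2
       (conjugate case).  Illustrative only; conjalpha != 0 means "conjugate". */
    static void invscalv_ref_z(int conjalpha, int n, double complex alpha,
                               double complex *x, int incx)
    {
        double denom = creal(alpha) * creal(alpha) + cimag(alpha) * cimag(alpha);
        double mr = creal(alpha) / denom;
        double mi = conjalpha ? cimag(alpha) / denom : -cimag(alpha) / denom;
        for (int i = 0; i < n; ++i) {
            double xr = creal(x[i * incx]), xi = cimag(x[i * incx]);
            x[i * incx] = (xr * mr - xi * mi) + (xr * mi + xi * mr) * I;
        }
    }
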
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
new file mode 100644
index 0000000000..cd2dd2c188
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
@@ -0,0 +1,124 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define SCAL2V(...) SCAL2V_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCAL2V
+#undef SCAL2V_
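
The intrinsics dispatch files in this patch (this one and the scalv/subv/xpbyv/axpy2v counterparts below) all follow the same pattern: a templated kernel body is re-included once per datatype while DATATYPE, PRECISION_CHAR, PREC and LMUL are redefined, and token pasting produces the per-type function names. A minimal self-contained sketch of that pattern, with the body inlined as a macro instead of a separate include so it compiles on its own (the demo_* names are illustrative):

    #include <stdio.h>

    /* One templated body, instantiated per precision via token pasting --
       the same idea as the #define/#include blocks above, shown inline. */
    #define DEFINE_SCAL2V(PRECISION_CHAR, T)                         \
        static void demo_##PRECISION_CHAR##scal2v(int n, T alpha,    \
                                                  const T *x, T *y)  \
        {                                                            \
            for (int i = 0; i < n; ++i)                              \
                y[i] = alpha * x[i];                                 \
        }

    DEFINE_SCAL2V(s, float)  /* expands to demo_sscal2v */
    DEFINE_SCAL2V(d, double) /* expands to demo_dscal2v */

    int main(void)
    {
        float x[3] = {1.0f, 2.0f, 3.0f}, y[3];
        demo_sscal2v(3, 2.0f, x, y);
        printf("%g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }
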
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..4a25ce3e32
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
@@ -0,0 +1,100 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+ // Computes y = alpha * conjx(x)
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (alpha->real == 0 && alpha->imag == 0) {
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+
+ if (alpha->real == 1 && alpha->imag == 0) {
+ COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+ yvec_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl);
+ yvec_imag = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->imag, vl);
+ if (conjx == BLIS_NO_CONJUGATE) {
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF( PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF( PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use
+ // __riscv_vcreate_v_f once they become available in LLVM.
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wuninitialized"
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+ #pragma GCC diagnostic pop
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // SCAL2V
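
Per element, the loop above computes y = alpha * conjx(x) split into real and imaginary lanes, with VFMUL seeding each lane and VFMACC/VFNMSAC folding in the cross terms. A plain scalar reference of the same update (the name and the single-precision choice are illustrative):

    #include <complex.h>

    /* Scalar reference for y[i] = alpha * conjx(x[i]), matching the
       mul / nmsac / macc ordering of the vector kernel above. */
    static void scal2v_ref_c(int conjx, int n, float complex alpha,
                             const float complex *x, int incx,
                             float complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            float complex xi = conjx ? conjf(x[i * incx]) : x[i * incx];
            float yr = crealf(alpha) * crealf(xi) - cimagf(alpha) * cimagf(xi);
            float yi = cimagf(alpha) * crealf(xi) + crealf(alpha) * cimagf(xi);
            y[i * incy] = yr + yi * I;
        }
    }
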
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..7084e15cf5
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
@@ -0,0 +1,82 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+ // Computes y = alpha * conjx(x)
+ // == alpha * x (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (*alpha == 0) {
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+
+ if (*alpha == 1) {
+ COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, xvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // SCAL2V
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
new file mode 100644
index 0000000000..b5788d632d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\
+ conj_t conjalpha, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ T* restrict x_, inc_t incx, \
+ const cntx_t* cntx \
+)
+
+#define SCALV(...) SCALV_(__VA_ARGS__)
+
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCALV
+#undef SCALV_
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..c6803c9676
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+ // Computes x = conjalpha(alpha) * x
+ const DATATYPE* restrict alpha = alpha_;
+ DATATYPE* restrict x = x_;
+
+ if (n <= 0 || (alpha->real == 1 && alpha->imag == 0)) return;
+
+ if (alpha->real == 0 && alpha->imag==0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+ RVV_TYPE_F(PREC, LMUL) temp_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl);
+ RVV_TYPE_F(PREC, LMUL) temp_imag = VFMUL_VF(PREC, LMUL)(xvec_imag, alpha->real, vl);
+ if (conjalpha == BLIS_NO_CONJUGATE) {
+ temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, alpha->imag, xvec_imag, vl);
+ temp_imag = VFMACC_VF(PREC, LMUL)( temp_imag, alpha->imag, xvec_real, vl);
+ } else {
+ temp_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->imag, xvec_imag, vl);
+ temp_imag = VFNMSAC_VF(PREC, LMUL)(temp_imag, alpha->imag, xvec_real, vl);
+ }
+
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, temp_real);
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, temp_imag);
+
+ if (incx == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, xvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, xvec, vl);
+
+ x += vl*incx;
+ avl -= vl;
+ }
+
+}
+
+#endif // SCALV
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..2b4e31d359
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
@@ -0,0 +1,76 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+ // Computes x = conjalpha(alpha) * x
+ // == alpha * x (real case)
+
+ (void) conjalpha; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ DATATYPE* restrict x = x_;
+
+ if (n <= 0 || *alpha == 1) return;
+
+ if (*alpha == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl);
+
+ if (incx == 1)
+ VSE_V_F(PREC, LMUL) (x, xvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl);
+
+ x += vl * incx;
+ avl -= vl;
+ }
+}
+
+#endif // SCALV
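
The real scalv loop above is the standard RVV strip-mining shape: ask vsetvl for up to avl elements, process the vl it grants, advance, repeat. The same shape written directly against the public RVV intrinsics, without the BLIS wrapper macros; this sketch assumes a toolchain that ships the prefixed __riscv_ intrinsics (e.g. recent Clang) and covers the unit-stride float case only:

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Contiguous float scalv, x[i] *= alpha, using the same strip-mining
       structure as the kernels above (unit stride only, for brevity). */
    static void scalv_f32_contig(size_t n, float alpha, float *x)
    {
        size_t avl = n;
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);           /* elements granted */
            vfloat32m8_t xv = __riscv_vle32_v_f32m8(x, vl);  /* load             */
            xv = __riscv_vfmul_vf_f32m8(xv, alpha, vl);      /* scale            */
            __riscv_vse32_v_f32m8(x, xv, vl);                /* store back       */
            x += vl;
            avl -= vl;
        }
    }
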
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
new file mode 100644
index 0000000000..ef9091f16c
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
@@ -0,0 +1,204 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ float* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ incx *= FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ double* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ incx *= FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ scomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(VLSE "v4, (t0), zero");
+ if (conjalpha == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+ incx *= 2 * FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ dcomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(VLSE "v4, (t0), zero");
+ if (conjalpha == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+ incx *= 2 * FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
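
The setv kernels above broadcast alpha into a vector register group once, before the loop, by issuing a strided load with stride zero (vlse*.v ..., zero); the loop body then only stores. The same broadcast expressed with the standard vfmv.v.f intrinsic, as a unit-stride float sketch (the splat is redone per iteration here purely to keep the sketch short; toolchain and naming assumptions as in the previous sketch):

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Contiguous float setv, x[i] = alpha.  The asm above hoists the
       broadcast out of the loop; here it is repeated for simplicity. */
    static void setv_f32_contig(size_t n, float alpha, float *x)
    {
        size_t avl = n;
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);
            vfloat32m8_t v = __riscv_vfmv_v_f_f32m8(alpha, vl); /* splat alpha */
            __riscv_vse32_v_f32m8(x, v, vl);
            x += vl;
            avl -= vl;
        }
    }
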
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
new file mode 100644
index 0000000000..e6b483a3f8
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define SUBV(...) SUBV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_subv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_subv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SUBV
+#undef SUBV_
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..2d4a1a017f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+SUBV(PRECISION_CHAR, void)
+{
+ // Computes y := y - conjx(x)
+ (void) cntx;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFSUB_VV(PREC, LMUL)(yvec_real, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+ else
+ yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // SUBV
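
In the complex subv above, conjugation of x only affects the imaginary lane, which is why the kernel switches between VFSUB_VV and VFADD_VV there while the real lane is always a subtraction. A scalar reference for the same semantics (illustrative name, single-precision complex):

    #include <complex.h>

    /* Scalar reference for y[i] -= conjx(x[i]); with conjugation the
       imaginary parts are effectively added, matching the VFADD_VV path. */
    static void subv_ref_c(int conjx, int n, const float complex *x, int incx,
                           float complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            float complex xi = conjx ? conjf(x[i * incx]) : x[i * incx];
            y[i * incy] -= xi;
        }
    }
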
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..b158594319
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
@@ -0,0 +1,77 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+SUBV(PRECISION_CHAR, void)
+{
+ // Computes y = y - conjx(x)
+ // == y - x (real case)
+ (void) cntx;
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFSUB_VV(PREC, LMUL)(yvec, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // SUBV
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
new file mode 100644
index 0000000000..2342e254a2
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
@@ -0,0 +1,245 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_,
+ inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ float* restrict x = x_;
+ float* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ double* restrict x = x_;
+ double* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ scomplex* restrict x = x_;
+ scomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * 2 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ dcomplex* restrict x = x_;
+ dcomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
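
A note on cswapv above: FLT_SIZE stays 4 but the loads/stores are e64 with stride 2*FLT_SIZE = 8 bytes, so each 64-bit element moves a full real/imaginary pair and no segment instructions are needed; swapv performs no arithmetic, only data movement. The corresponding scalar semantics (the scomplex_ref struct and names are illustrative stand-ins):

    #include <stddef.h>

    typedef struct { float real, imag; } scomplex_ref; /* stand-in for scomplex */

    /* Scalar reference for cswapv: exchange n complex elements of x and y. */
    static void swapv_c_ref(size_t n, scomplex_ref *x, ptrdiff_t incx,
                            scomplex_ref *y, ptrdiff_t incy)
    {
        for (size_t i = 0; i < n; ++i) {
            scomplex_ref tmp = x[i * incx];
            x[i * incx] = y[i * incy];
            y[i * incy] = tmp;
        }
    }
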
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..dce4085bff
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict beta_, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* restrict cntx \
+)
+
+#define XPBYV(...) XPBYV_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef XPBYV
+#undef XPBYV_
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..4c86e8b36a
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,101 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+XPBYV(PRECISION_CHAR, void)
+{
+ // Computes y = beta * y + conjx(x)
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ if (beta->real == 0 && beta->imag == 0){
+ COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // TO DO (optimization): beta = +-1, +-i special cases
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ // xpbyv is computed with FMAs as follows:
+ // y[i].real = x[i].real + beta.real * y[i].real - beta.imag * y[i].imag
+ // y[i].imag = conjx(x[i]).imag + beta.imag * y[i].real + beta.real * y[i].imag
+
+ xvec_real = VFMACC_VF( PREC, LMUL)(xvec_real, beta->real, yvec_real, vl);
+ xvec_real = VFNMSAC_VF(PREC, LMUL)(xvec_real, beta->imag, yvec_imag, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl);
+ else
+ xvec_imag = VFMSAC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl);
+ xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->real, yvec_imag, vl);
+
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real);
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, xvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, xvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+}
+
+#endif // XPBYV
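
The FMA grouping documented in the comment above can be checked against a direct scalar evaluation of y = beta * y + conjx(x); a reference sketch in double-precision complex (name illustrative):

    #include <complex.h>

    /* Scalar reference for xpbyv: y[i] = beta * y[i] + conjx(x[i]). */
    static void xpbyv_ref_z(int conjx, int n,
                            const double complex *x, int incx,
                            double complex beta,
                            double complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            double complex xi = conjx ? conj(x[i * incx]) : x[i * incx];
            y[i * incy] = beta * y[i * incy] + xi;
        }
    }
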
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..b23272fea4
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,84 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+XPBYV(PRECISION_CHAR, void)
+{
+ // Computes y = beta * y + conjx(x)
+ // == beta * y + x (real case)
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ if (*beta == 0){
+ COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // TO DO (optimization): beta = +-1 special cases
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMADD_VF(PREC, LMUL)(yvec, *beta, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // XPBYV
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
new file mode 100644
index 0000000000..1b5ce3b962
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\
+ conj_t conjx, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alphax_, \
+ const T* restrict alphay_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict z_, inc_t incz, \
+ const cntx_t* restrict cntx \
+)
+
+#define AXPY2V(...) AXPY2V_(__VA_ARGS__)
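+// Instantiate the kernel once per datatype by defining the type/precision
+// macros and textually including the real or complex body. Real kernels use
+// LMUL = m8; complex kernels use m4, since each segment load/store carries
+// two LMUL-wide fields (the real and imaginary planes).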
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPY2V
+#undef AXPY2V_
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..9b57198272
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
@@ -0,0 +1,117 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+ // Computes z := z + alphax * conjx(x) + alphay * conjy(y)
+ const DATATYPE* restrict alphax = alphax_;
+ const DATATYPE* restrict alphay = alphay_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ if (incz == 1)
+ zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl);
+ else
+ zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+ zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+ zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+
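+ // Complex FMA expanded into real arithmetic: for a = ar + ai*i and x = xr + xi*i,
+ //   a*x       = (ar*xr - ai*xi) + (ai*xr + ar*xi)*i
+ //   a*conj(x) = (ar*xr + ai*xi) + (ai*xr - ar*xi)*i
+ // The branches below pick the sign of the xi terms accordingly.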
+ // + alphax * conjx(x)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->real, xvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphax->imag, xvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->real, xvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->imag, xvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphax->real, xvec_imag, vl);
+ }
+
+ // + alphay * conjy(y)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->real, yvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->imag, yvec_real, vl);
+ if (conjy == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphay->imag, yvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->real, yvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->imag, yvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphay->real, yvec_imag, vl);
+ }
+
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real);
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag);
+
+ if (incz == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..cebb159973
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
@@ -0,0 +1,91 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+ // Computes z := z + alphax * conjx(x) + alphay * conjy(y)
+ // == z + alphax * x + alphay * y (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ (void) conjy;
+ const DATATYPE* restrict alphax = alphax_;
+ const DATATYPE* restrict alphay = alphay_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (incz == 1)
+ zvec = VLE_V_F(PREC, LMUL)(z, vl);
+ else
+ zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl);
+
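+ // Two scalar-times-vector FMAs per element: z += alphax * x, then z += alphay * y.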
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphax, xvec, vl);
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphay, yvec, vl);
+
+ if (incz == 1)
+ VSE_V_F(PREC, LMUL)(z, zvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
new file mode 100644
index 0000000000..43c2ba44e2
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
@@ -0,0 +1,430 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_, inc_t inca,
+ inc_t lda, const void *restrict x_, inc_t incx,
+ void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
+ (void)conja;
+ (void)conjx;
+ (void)cntx;
+ const float *restrict alpha = alpha_;
+ const float *restrict a = a_;
+ const float *restrict x = x_;
+ float *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ // process vl elements of y at a time
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ // x_tmp traverses x
+ // a points to the vl x b block of a needed this iteration
+ // a_tmp traverses the columns of this block
+ const float* restrict x_tmp = x;
+ const float* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+
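+ // Accumulate the remaining b-1 columns: v0 += x[i] * a(:, i).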
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ }
+
+ if (incy == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSE "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_, inc_t inca,
+ inc_t lda, const void *restrict x_, inc_t incx,
+ void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
+ (void)conja;
+ (void)conjx;
+ (void)cntx;
+ const double *restrict alpha = alpha_;
+ const double *restrict a = a_;
+ const double *restrict x = x_;
+ double *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ // process vl elements of y at a time
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ // x_tmp traverses x
+ // a points to the vl x b block of a needed this iteration
+ // a_tmp traverses the columns of this block
+ const double* restrict x_tmp = x;
+ const double* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ }
+
+ if (incy == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSE "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLSEG "vlseg2e32.v "
+#define VLSSEG "vlsseg2e32.v "
+#define VSSEG "vsseg2e32.v "
+#define VSSSEG "vssseg2e32.v "
+
+void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_,
+ inc_t inca, inc_t lda, const void *restrict x_,
+ inc_t incx, void *restrict y_, inc_t incy,
+ const cntx_t *restrict cntx) {
+ (void)cntx;
+ const scomplex *restrict alpha = alpha_;
+ const scomplex *restrict a = a_;
+ const scomplex *restrict x = x_;
+ scomplex *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmul.vf v0, v24, ft0");
+ __asm__("vfmul.vf v4, v24, ft1");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmsac.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmacc.vf v4, ft0, v28");
+ }
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ }
+ }
+
+ if (incy == 2 * FLT_SIZE) {
+ __asm__(VLSEG "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSEG "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG
+#undef VLSSEG
+#undef VSSEG
+#undef VSSSEG
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLSEG "vlseg2e64.v "
+#define VLSSEG "vlsseg2e64.v "
+#define VSSEG "vsseg2e64.v "
+#define VSSSEG "vssseg2e64.v "
+
+void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_,
+ inc_t inca, inc_t lda, const void *restrict x_,
+ inc_t incx, void *restrict y_, inc_t incy,
+ const cntx_t *restrict cntx) {
+ (void)cntx;
+ const dcomplex *restrict alpha = alpha_;
+ const dcomplex *restrict a = a_;
+ const dcomplex *restrict x = x_;
+ dcomplex *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmul.vf v0, v24, ft0");
+ __asm__("vfmul.vf v4, v24, ft1");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmsac.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmacc.vf v4, ft0, v28");
+ }
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ }
+ }
+
+ if (incy == 2 * FLT_SIZE) {
+ __asm__(VLSEG "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSEG "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..9cd1071d7a
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjx, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict rho_, \
+ T* restrict z_, inc_t incz, \
+ const cntx_t* restrict cntx \
+)
+
+#define DOTAXPYV(...) DOTAXPYV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTAXPYV
+#undef DOTAXPYV_
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..c3cd06c523
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,151 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+ // Computes z := z + alpha * conjx(x)
+ // and rho := conjxt(x)^T * conjy(y)
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict rho = rho_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+ bool first = true;
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
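+ // The accumulators live across strip-mined iterations; tail-undisturbed (_TU)
+ // FMAs are used after the first pass so lanes beyond a final, shorter vl
+ // keep their previously accumulated partial sums.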
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag;
+
+ // Loads
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ if (incz == 1)
+ zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl);
+ else
+ zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+ zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+ zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+
+ // z := z + alpha * conjx(x)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->real, xvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alpha->imag, xvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->imag, xvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ // rho := conjxt(x)^T * conjy(y)
+ // We accumulate the current term of the dot product as (a*c-b*d) + (a*d+b*c)*i,
+ // conjugating when necessary
+ if (first) {
+ // Initialize real part: a*c
+ acc_real = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_real, vl);
+ // Initialize imaginary part: a*d
+ acc_imag = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_imag, vl);
+ if (conjy == BLIS_CONJUGATE)
+ acc_imag = VFNEG_VF(PREC, LMUL)(acc_imag, vl); // TO DO: eliminate this negation
+ first = false;
+ } else {
+ // Accumulate real part: a*c
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_real, yvec_real, vl);
+ // Accumulate imaginary part: a*d
+ if (conjy == BLIS_NO_CONJUGATE)
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ else
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+ // Finish real part: b*d
+ if ((conjxt == BLIS_NO_CONJUGATE) ^ (conjy == BLIS_NO_CONJUGATE))
+ // Exactly one is conjugated => add
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ else
+ acc_real = VFNMSAC_VV_TU(PREC,LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ // Finish imaginary part: b*c
+ if (conjxt == BLIS_NO_CONJUGATE)
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+ else
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+
+ // Stores
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real);
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag);
+
+ if (incz == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+ // Compute rho
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)( 0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)( 0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+ rho->real = VFMV_F_S(PREC)(sum_real);
+ rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..adaf3610b0
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
@@ -0,0 +1,111 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+ // Computes z := z + alpha * conjx(x)
+ //  == z + alpha * x (real case)
+ // and rho := conjxt(x)^T * conjy(y)
+ // == x^T * y (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ (void) conjxt;
+ (void) conjy;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict rho = rho_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+ bool first = true;
+ RVV_TYPE_F(PREC, LMUL) acc;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec;
+
+ // Loads
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (incz == 1)
+ zvec = VLE_V_F(PREC, LMUL)(z, vl);
+ else
+ zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl);
+
+ // z := z + alpha * x
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, xvec, vl);
+
+ // rho := x^T * y
+ if (first){
+ acc = VFMUL_VV(PREC, LMUL)( xvec, yvec, vl);
+ first = false;
+ } else {
+ acc = VFMACC_VV_TU(PREC, LMUL)( acc, xvec, yvec, vl);
+ }
+
+ // Store
+ if (incz == 1)
+ VSE_V_F(PREC, LMUL)(z, zvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+ // Compute rho
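+ // vfredusum reduces the LMUL-wide accumulator into element 0 of an m1
+ // register seeded with 0; vfmv.f.s then extracts the scalar result.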
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)( 0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+ *rho = VFMV_F_S(PREC)(sum);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
new file mode 100644
index 0000000000..ecb340707b
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
@@ -0,0 +1,3120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sdotxaxpyf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ ) {
+ (void)conjat;
+ (void)conja;
+ (void)conjw;
+ (void)conjx;
+ (void)cntx;
+ const float *restrict alpha = alpha_;
+ const float *restrict beta = beta_;
+ const float *restrict a = a_;
+ const float *restrict w = w_;
+ const float *restrict x = x_;
+ float *restrict y = y_;
+ float *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.f) {
+ // scale y by beta
+ if (*beta == 0.f)
+ bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incw *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ incz *= FLT_SIZE;
+ inc_t a_bump = 5 * lda;
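+ // Main loop: handle five columns of a per pass (equivalently, five rows of
+ // a^T). Register plan: ft10 = alpha, ft11 = beta; v28 holds the current chunk
+ // of w; v24 is a scratch column of a; v0, v4, v8, v12, v16 accumulate the dot
+ // products of w with the five columns; v20 accumulates the a*x contribution
+ // that is added to z.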
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const float* restrict w_tmp = w;
+ const float* restrict z_tmp = z;
+ const float* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ const float* restrict w_tmp = w;
+ const float* restrict z_tmp = z;
+ const float* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_ddotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
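+ // Fused operation (per the BLIS dotxaxpyf interface): y := beta*y + alpha*A^T*w
+ // and z := z + alpha*A*x, with A of size m x b. Conjugation arguments have no
+ // effect in the real domain, so they are ignored below.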
+ (void)conjat;
+ (void)conja;
+ (void)conjw;
+ (void)conjx;
+ (void)cntx;
+ const double *restrict alpha = alpha_;
+ const double *restrict beta = beta_;
+ const double *restrict a = a_;
+ const double *restrict w = w_;
+ const double *restrict x = x_;
+ double *restrict y = y_;
+ double *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.) {
+ // scale y by beta
+ if (*beta == 0.)
+ bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incw *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ incz *= FLT_SIZE;
+ inc_t a_bump = 5 * lda;
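+ // Main loop: handle b five at a time. For each chunk of m, v0/v4/v8/v12/v16
+ // accumulate the elementwise products of w with each of the five a vectors
+ // (reduced to dot products after the chunk loop), v20 accumulates
+ // sum_j x[j]*a_j for the axpyf update of z, v24 holds the current a chunk,
+ // and v28 holds the w chunk.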
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const double* restrict w_tmp = w;
+ const double* restrict z_tmp = z;
+ const double* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a non-unit stride
+
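+ // Fold this chunk's axpyf partial into z: z := z + alpha*v20, using unit-stride
+ // or strided vector accesses depending on incz.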
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
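+ // Reduce each accumulator to a scalar: v31[0] is zeroed and used as the initial
+ // value for vfredusum.vs, whose result lands in element 0 of the destination and
+ // is then combined with alpha and beta*y.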
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ const double* restrict w_tmp = w;
+ const double* restrict z_tmp = z;
+ const double* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FNMSUB "fnmsub.s "
+#define FNEG "fneg.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSE "vse32.v "
+
+void bli_cdotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
+ (void)cntx;
+ const scomplex *restrict alpha = alpha_;
+ const scomplex *restrict beta = beta_;
+ const scomplex *restrict a = a_;
+ const scomplex *restrict w = w_;
+ const scomplex *restrict x = x_;
+ scomplex *restrict y = y_;
+ scomplex *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+ // scale y by beta
+ if (beta->real == 0.f && beta->imag == 0.f)
+ bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+ // and fa6-fa7 to store beta
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to case when A^T is not conjugated, then conjugate
+ // computed product A^T * w if needed.
+ conj_t conjatw = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjw);
+ bli_toggle_conj(&conjatw);
+ }
+ conj_t conjax = BLIS_NO_CONJUGATE;
+ if (conja == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conja);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjax);
+ }
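+ // Similarly, conj(A)*x == conj(A*conj(x)): when conja is conjugated, toggle conjx
+ // instead and record conjax so the accumulated A*x partials are conjugated when
+ // they are folded into z.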
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incw *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ incz *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 5 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const scomplex* restrict w_tmp = w;
+ const scomplex* restrict z_tmp = z;
+ const scomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
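+ // Register layout for this loop: v28/v30 hold the real/imag parts of the current
+ // w chunk (the segment loads deinterleave them), v24/v26 the current a chunk,
+ // v0/v2 through v16/v18 the five complex dot-product accumulators, and v20/v22
+ // the complex axpyf partial for z.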
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 5 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 5
+ const scomplex* restrict w_tmp = w;
+ const scomplex* restrict z_tmp = z;
+ const scomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FNMSUB "fnmsub.d "
+#define FNEG "fneg.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSE "vse64.v "
+
+void bli_zdotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
+ (void)cntx;
+ const dcomplex *restrict alpha = alpha_;
+ const dcomplex *restrict beta = beta_;
+ const dcomplex *restrict a = a_;
+ const dcomplex *restrict w = w_;
+ const dcomplex *restrict x = x_;
+ dcomplex *restrict y = y_;
+ dcomplex *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+ // scale y by beta
+ if (beta->real == 0. && beta->imag == 0.)
+ bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+ // and fa6-fa7 to store beta
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to case when A^T is not conjugated, then conjugate
+ // computed product A^T * w if needed.
+ conj_t conjatw = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjw);
+ bli_toggle_conj(&conjatw);
+ }
+ conj_t conjax = BLIS_NO_CONJUGATE;
+ if (conja == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conja);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjax);
+ }
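+ // For reference, a pseudocode sketch (names and indexing illustrative
+ // only, not part of the kernel) of the fused operation dotxaxpyf
+ // performs, with A viewed as m x b, w and z of length m, and x and y of
+ // length b:
+ //
+ //   y := beta * y + alpha * conjat(A)^T * conjw(w)
+ //   z := z + alpha * conja(A) * conjx(x)
+ //
+ // i.e. per element (roughly; when beta is zero, y is written without
+ // being read),
+ //
+ //   y[j] = beta*y[j] + alpha * sum_i conjat(A[i][j]) * conjw(w[i])
+ //   z[i] = z[i]      + alpha * sum_j conja(A[i][j])  * conjx(x[j])
+ //
+ // The loop below fuses both updates, handling 5 columns of A (5 elements
+ // of y) per outer pass and vl rows per vectorized inner pass.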
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incw *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ incz *= 2 * FLT_SIZE;
+ // a_bump bumps a down 5 rows per outer iteration; y_bump is incy minus
+ // the FLT_SIZE already added to y between storing the real and imaginary
+ // parts of each element of y.
+ inc_t a_bump = 5 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const dcomplex* restrict w_tmp = w;
+ const dcomplex* restrict z_tmp = z;
+ const dcomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 5 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 5
+ const dcomplex* restrict w_tmp = w;
+ const dcomplex* restrict z_tmp = z;
+ const dcomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
new file mode 100644
index 0000000000..5ac2d41667
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
@@ -0,0 +1,2645 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ // Think of a as a b x m row-major matrix (i.e. rsa = lda, csa = inca).
+ // We process 6 elements of y per iteration. a points to the 6 x m block
+ // of a needed for this iteration; each 6 x m block is broken into
+ // 6 x vl blocks. a_col points to the current 6 x vl block, x_tmp is used
+ // to load from x, and a_row is used to load each of the 6 rows of this
+ // 6 x vl block.
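+ // For reference, a sketch (illustrative pseudocode, not part of the
+ // kernel) of the operation dotxf performs:
+ //
+ //   y := beta * y + alpha * conjat(A)^T * conjx(x)
+ //
+ // with A an m x b matrix, x of length m, and y of length b; per element,
+ //
+ //   y[j] = beta*y[j] + alpha * sum_i A[i][j] * x[i]
+ //
+ // Conjugation is a no-op in the real domain, hence the (void) casts of
+ // conjat and conjx below.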
+ (void)conjat;
+ (void)conjx;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ const float* restrict a = a_;
+ const float* restrict x = x_;
+ const float* restrict beta = beta_;
+ float* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.f) {
+ // scale y by beta
+ if (*beta == 0.f)
+ bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const float* restrict x_tmp = x;
+ const float* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLE "v20, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__(VSE "v20, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v20");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // compute dot product of x with remaining < 6 rows of a
+ const float* restrict x_tmp = x;
+ // a_col will move along the last row of a!
+ const float* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_ddotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ // Think of a as a b x m row-major matrix (i.e. rsa = lda, csa = inca).
+ // We process 6 elements of y per iteration. a points to the 6 x m block
+ // of a needed for this iteration; each 6 x m block is broken into
+ // 6 x vl blocks. a_col points to the current 6 x vl block, x_tmp is used
+ // to load from x, and a_row is used to load each of the 6 rows of this
+ // 6 x vl block.
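+ // (Same operation and blocking as bli_sdotxf_sifive_x280_asm above; see
+ // the reference sketch there. Only the element width differs.)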
+ (void)conjat;
+ (void)conjx;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ const double* restrict a = a_;
+ const double* restrict x = x_;
+ const double* restrict beta = beta_;
+ double* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.) {
+ // scale y by beta
+ if (*beta == 0.)
+ bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const double* restrict x_tmp = x;
+ const double* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLE "v20, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
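+ // reduce each accumulator to a scalar with vfredusum.vs, then compute
+ // y[i] = alpha * dot when beta == 0, or y[i] = beta * y[i] + alpha * dot.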
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__(VSE "v20, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v20");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // compute dot product of x with remaining < 6 rows of a
+ const double* restrict x_tmp = x;
+ // a_col will move along the last row of a!
+ const double* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
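+ // same strip-mining as above, but only b (< 6) rows remain: a_col starts at
+ // the last remaining row and a_row steps backwards by lda, and the switch
+ // cases below fall through intentionally so that exactly b rows are processed.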
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
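+ // y now points to y[b - 1]; results are stored from the last remaining element
+ // backwards, and these switch cases likewise fall through intentionally.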
+ __asm__("vmv.s.x v31, x0");
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FNMSUB "fnmsub.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSE "vse32.v "
+
+void bli_cdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ const scomplex* restrict a = a_;
+ const scomplex* restrict x = x_;
+ const scomplex* restrict beta = beta_;
+ scomplex* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+ // scale y by beta
+ if (beta->real == 0.f && beta->imag == 0.f)
+ bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to the case where A^T is not conjugated, then conjugate the
+ // computed product A^T * x at the end if needed.
+ conj_t conjatx = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjatx);
+ }
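+ // (conj(A)^T * x == conj(A^T * conj(x)), so toggling conjx and conjugating the
+ // final accumulators, as tracked by conjatx, gives the same result.)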
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 6 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
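+ // y_bump is incy - FLT_SIZE because y is advanced by FLT_SIZE between storing
+ // the real and imaginary parts of each element, for a net advance of incy.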
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
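+ // in the loop below, v28/v30 hold the real/imaginary parts of a chunk of x,
+ // v24/v26 those of one row of a, and the register pairs v0/v2 ... v20/v22
+ // accumulate the six complex dot products via the vcmul_vv/vcmacc_vv macros.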
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
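+ // reduce the real and imaginary accumulators for each of the 6 rows, combine
+ // with alpha (and with beta * y[i] when beta is nonzero), and store to y.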
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vfredusum.vs v22, v22, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 6
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
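+ // cleanup strip-mining: a_col starts at the last remaining row and the switch
+ // cases below fall through so that exactly b rows are processed.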
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
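+ // here y walks backwards: after the real part is stored, y has advanced by
+ // FLT_SIZE, so subtracting incy + FLT_SIZE steps back one full element.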
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FNMSUB "fnmsub.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSE "vse64.v "
+
+void bli_zdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ const dcomplex* restrict a = a_;
+ const dcomplex* restrict x = x_;
+ const dcomplex* restrict beta = beta_;
+ dcomplex* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+ // scale y by beta
+ if (beta->real == 0. && beta->imag == 0.)
+ bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to the case where A^T is not conjugated, then conjugate the
+ // computed product A^T * x at the end if needed.
+ conj_t conjatx = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjatx);
+ }
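+ // (same reduction to the non-conjugated A^T case as in bli_cdotxf_sifive_x280_asm above.)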
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 6 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vfredusum.vs v22, v22, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 6
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ } // end cleanup
+ return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
new file mode 100644
index 0000000000..35ca23677d
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
@@ -0,0 +1,678 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSSEG7 "vssseg7e32.v "
+
+void bli_spackm_sifive_x280_asm_7xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const float* kappa = kappa_;
+ const float* a = a_;
+ float* p = p_;
+
+ float kappa_cast = *kappa;
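+ // Pack a 7 x n micropanel of A into P. If lda == 1, each of the 7 rows of A is
+ // contiguous: the rows are loaded into v0-v6 and interleaved into P with a
+ // 7-field strided segment store. Otherwise the panel is packed one column at a
+ // time. Rows at index cdim and above, and columns n..n_max-1 of P, are zero-filled.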
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 0: __asm__("vmv.v.i v0, 0");
+ case 1: __asm__("vmv.v.i v1, 0");
+ case 2: __asm__("vmv.v.i v2, 0");
+ case 3: __asm__("vmv.v.i v3, 0");
+ case 4: __asm__("vmv.v.i v4, 0");
+ case 5: __asm__("vmv.v.i v5, 0");
+ case 6: __asm__("vmv.v.i v6, 0");
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const float* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.f) {
+ switch (cdim) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSSEG7 "vssseg7e64.v "
+
+void bli_dpackm_sifive_x280_asm_7xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const double* kappa = kappa_;
+ const double* a = a_;
+ double* p = p_;
+
+ double kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 0: __asm__("vmv.v.i v0, 0");
+ case 1: __asm__("vmv.v.i v1, 0");
+ case 2: __asm__("vmv.v.i v2, 0");
+ case 3: __asm__("vmv.v.i v3, 0");
+ case 4: __asm__("vmv.v.i v4, 0");
+ case 5: __asm__("vmv.v.i v5, 0");
+ case 6: __asm__("vmv.v.i v6, 0");
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const double* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.) {
+ switch (cdim) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+
+void bli_cpackm_sifive_x280_asm_6xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const scomplex* kappa = kappa_;
+ const scomplex* a = a_;
+ scomplex* p = p_;
+
+ scomplex kappa_cast = *kappa;
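+ // Complex variant of the packing above: vlseg2/vlsseg2 split each row into real
+ // and imaginary vectors. For kappa == 1, conjugation is just vfneg on the
+ // imaginary parts; otherwise rows are scaled by kappa with vcmul_vf2 /
+ // vcmul_vf_conj2 before the 6-field segment stores into P.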
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ case 1:
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v5, 0");
+ case 3:
+ __asm__("vmv.v.i v6, 0");
+ __asm__("vmv.v.i v7, 0");
+ case 4:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v9, 0");
+ case 5:
+ __asm__("vmv.v.i v10, 0");
+ __asm__("vmv.v.i v11, 0");
+ }
+ }
+ else {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v13, 0");
+ case 1:
+ __asm__("vmv.v.i v14, 0");
+ __asm__("vmv.v.i v15, 0");
+ case 2:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v17, 0");
+ case 3:
+ __asm__("vmv.v.i v18, 0");
+ __asm__("vmv.v.i v19, 0");
+ case 4:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v21, 0");
+ case 5:
+ __asm__("vmv.v.i v22, 0");
+ __asm__("vmv.v.i v23, 0");
+ }
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const scomplex* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 6:
+ __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim) {
+ case 6: __asm__("vfneg.v v11, v11");
+ case 5: __asm__("vfneg.v v9, v9");
+ case 4: __asm__("vfneg.v v7, v7");
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim) {
+ case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim) {
+ case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG6
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+
+void bli_zpackm_sifive_x280_asm_6xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const dcomplex* kappa = kappa_;
+ const dcomplex* a = a_;
+ dcomplex* p = p_;
+
+ dcomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ case 1:
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v5, 0");
+ case 3:
+ __asm__("vmv.v.i v6, 0");
+ __asm__("vmv.v.i v7, 0");
+ case 4:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v9, 0");
+ case 5:
+ __asm__("vmv.v.i v10, 0");
+ __asm__("vmv.v.i v11, 0");
+ }
+ }
+ else {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v13, 0");
+ case 1:
+ __asm__("vmv.v.i v14, 0");
+ __asm__("vmv.v.i v15, 0");
+ case 2:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v17, 0");
+ case 3:
+ __asm__("vmv.v.i v18, 0");
+ __asm__("vmv.v.i v19, 0");
+ case 4:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v21, 0");
+ case 5:
+ __asm__("vmv.v.i v22, 0");
+ __asm__("vmv.v.i v23, 0");
+ }
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const dcomplex* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 6:
+ __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim) {
+ case 6: __asm__("vfneg.v v11, v11");
+ case 5: __asm__("vfneg.v v9, v9");
+ case 4: __asm__("vfneg.v v7, v7");
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim) {
+ case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim) {
+ case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
new file mode 100644
index 0000000000..89e05ecae3
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
@@ -0,0 +1,838 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define VSSSEG7 "vssseg7e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG5 "vssseg5e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG3 "vssseg3e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define NR 64
+
+void bli_spackm_sifive_x280_asm_64xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const float* kappa = kappa_;
+ const float* a = a_;
+ float* p = p_;
+
+ float kappa_cast = *kappa;
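+ // Pack an NR x n panel (NR = 64) of A into P. With lda == 1, rows are packed
+ // eight at a time via 8-field segment stores, and a switch handles the 1-7 row
+ // remainder. Rows cdim..NR-1 of each packed column and columns n..n_max-1 of P
+ // are zero-filled.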
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v8, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const float* a_tmp = a;
+ float* p_tmp = p;
+ while (cdim_tmp >= 8) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ p_tmp += 8;
+ cdim_tmp -= 8;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.f) {
+ switch (cdim_tmp) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 6:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 5:
+ __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 4:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 3:
+ __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VSSSEG8 "vssseg8e64.v "
+#define VSSSEG7 "vssseg7e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG5 "vssseg5e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG3 "vssseg3e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define NR 32
+
+void bli_dpackm_sifive_x280_asm_32xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const double* kappa = kappa_;
+ const double* a = a_;
+ double* p = p_;
+
+ double kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v8, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const double* a_tmp = a;
+ double* p_tmp = p;
+ while (cdim_tmp >= 8) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ p_tmp += 8;
+ cdim_tmp -= 8;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.) {
+ switch (cdim_tmp) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 6:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 5:
+ __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 4:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 3:
+ __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define NR 32
+
+void bli_cpackm_sifive_x280_asm_32xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const scomplex* kappa = kappa_;
+ const scomplex* a = a_;
+ scomplex* p = p_;
+
+ scomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const scomplex* a_tmp = a;
+ scomplex* p_tmp = p;
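+ // Pack four complex rows per iteration: each vlseg2 load splits one row into
+ // real/imaginary vectors (v0-v7 for the four rows), and a single 8-field
+ // segment store writes them, interleaved, into P.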
+ while (cdim_tmp >= 4) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ __asm__("vfneg.v v3, v3");
+ __asm__("vfneg.v v5, v5");
+ __asm__("vfneg.v v7, v7");
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ p_tmp += 4;
+ cdim_tmp -= 4;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v2, v2");
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSSSEG4
+#undef VSSSEG6
+#undef VSSSEG8
+#undef NR
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG8 "vssseg8e64.v "
+#define NR 16
+
+void bli_zpackm_sifive_x280_asm_16xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const dcomplex* kappa = kappa_;
+ const dcomplex* a = a_;
+ dcomplex* p = p_;
+
+ dcomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const dcomplex* a_tmp = a;
+ dcomplex* p_tmp = p;
+ while (cdim_tmp >= 4) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ __asm__("vfneg.v v3, v3");
+ __asm__("vfneg.v v5, v5");
+ __asm__("vfneg.v v7, v7");
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ p_tmp += 4;
+ cdim_tmp -= 4;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v2, v2");
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
new file mode 100644
index 0000000000..b9715988d6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
@@ -0,0 +1,2405 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+// byte-size of the floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+#define PACKMR 8
+#define PACKNR 64
+
+void bli_sgemm_7m4
+ (
+ dim_t N,
+ dim_t K,
+ const float* restrict alpha,
+ const float* restrict a,
+ const float* restrict b,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0
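+ // Rows of the 7 x N microtile accumulate in the LMUL-4 register groups
+ // v0, v4, v8, v12, v16, v20 and v24; v28 holds the current row of b.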
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmul.vf v24, v28, ft6");
+
+ first = false;
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmacc.vf v24, ft6, v28");
+ }
+
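+ // Advance a to the next column of its packed micropanel (stride PACKMR)
+ // and b to the next row of its packed micropanel (stride PACKNR).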
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("vfmul.vf v24, v24, ft10");
+ }
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ float *c_tmp = c;
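+ // Each row interleaves a load of c with the alpha scaling of the
+ // accumulator, then adds beta times the loaded row.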
+ if (csc == FLT_SIZE) { // c unit column stride
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c unit column stride
+ else { // c non-unit column stride
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c non-unit column stride
+ } // end beta != 0.f
+
+ // store c
+ if (csc == FLT_SIZE) {
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_sgemm_7m4_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const float* restrict alpha,
+ const float* restrict a,
+ const float* restrict b,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K sgemm, 0 < M < 7, 0 < N <= 64 = vlmax, K > 0
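+ // The switch statements below intentionally fall through so that all M
+ // rows (counted from row M-1 down to row 0) are processed.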
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+ }
+ first = false;
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+ }
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.f) {
+ switch (M) {
+ case 6:
+ __asm__("vfmul.vf v20, v20, ft10");
+ case 5:
+ __asm__("vfmul.vf v16, v16, ft10");
+ case 4:
+ __asm__("vfmul.vf v12, v12, ft10");
+ case 3:
+ __asm__("vfmul.vf v8, v8, ft10");
+ case 2:
+ __asm__("vfmul.vf v4, v4, ft10");
+ case 1:
+ __asm__("vfmul.vf v0, v0, ft10");
+ }
+ }
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ float *c_tmp = c;
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 6:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.f
+
+ // store c
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+ return;
+}
+
+void bli_sgemm_7m4_k0
+ (
+ dim_t M,
+ dim_t N,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 7, 0 < N <= 64 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
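+ // Only the beta*c term is formed here: c is either cleared or scaled by
+ // beta, addressed from its last row upward with fall-through switches.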
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+ if (*beta == 0.f) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta == 0.f
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta));
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLE "v24, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLE "v20, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.f
+ return;
+}
+
+void bli_sgemm_sifive_x280_asm_7m4
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ (void) data;
+ (void) cntx;
+ const float* restrict alpha = alpha_;
+ const float* restrict beta = beta_;
+ const float* restrict a = a_;
+ const float* restrict b = b_;
+ float* restrict c = c_;
+
+ // M x N x K sgemm
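+ // Dispatch: early exit on degenerate sizes, a beta-only update when
+ // K == 0, the full 7-row microkernel when M == 7, and the cleanup
+ // kernel for partial M.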
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc);
+ else if (M == 7)
+ bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of the floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define PACKMR 8
+#define PACKNR 32
+
+void bli_dgemm_7m4
+ (
+ dim_t N,
+ dim_t K,
+ const double* restrict alpha,
+ const double* restrict a,
+ const double* restrict b,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 7 x N x K dgemm, 0 < N <= 32 = vlmax, K > 0
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmul.vf v24, v28, ft6");
+
+ first = false;
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmacc.vf v24, ft6, v28");
+ }
+
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("vfmul.vf v24, v24, ft10");
+ }
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ double *c_tmp = c;
+ if (csc == FLT_SIZE) { // c unit column stride
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c unit column stride
+ else { // c non-unit column stride
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c non-unit column stride
+ } // end beta != 0.
+
+ // store c
+ if (csc == FLT_SIZE) {
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_dgemm_7m4_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const double* restrict alpha,
+ const double* restrict a,
+ const double* restrict b,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K dgemm, 0 < M < 7, 0 < N <= 32 = vlmax, K > 0
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+ }
+ first = false;
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+ }
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.) {
+ switch (M) {
+ case 6:
+ __asm__("vfmul.vf v20, v20, ft10");
+ case 5:
+ __asm__("vfmul.vf v16, v16, ft10");
+ case 4:
+ __asm__("vfmul.vf v12, v12, ft10");
+ case 3:
+ __asm__("vfmul.vf v8, v8, ft10");
+ case 2:
+ __asm__("vfmul.vf v4, v4, ft10");
+ case 1:
+ __asm__("vfmul.vf v0, v0, ft10");
+ }
+ }
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ double *c_tmp = c;
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 6:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.
+
+ // store c
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+ return;
+}
+
+void bli_dgemm_7m4_k0
+ (
+ dim_t M,
+ dim_t N,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 7, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+ if (*beta == 0.) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta == 0.
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta));
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLE "v24, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLE "v20, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.
+ return;
+}
+
+void bli_dgemm_sifive_x280_asm_7m4
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ (void) data;
+ (void) cntx;
+ const double* restrict alpha = alpha_;
+ const double* restrict beta = beta_;
+ const double* restrict a = a_;
+ const double* restrict b = b_;
+ double* restrict c = c_;
+
+ // M x N x K dgemm
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc);
+ else if (M == 7)
+ bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of underlying floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define PACKMR 8
+#define PACKNR 32
+
+void bli_cgemm_6m2
+ (
+ dim_t N,
+ dim_t K,
+ const scomplex* restrict alpha,
+ const scomplex* restrict a,
+ const scomplex* restrict b,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 6 x N x K cgemm, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
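+ // Rows of b are double-buffered in v24/v26 and v28/v30, and the K loop
+ // below is unrolled by two so that each half consumes one buffer while
+ // the other is being reloaded.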
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmul_vf(v20, v22, v24, v26, ft10, ft11);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
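+ // Scale each accumulator by alpha using v24-v31 as scratch:
+ // real' = alpha.real * real - alpha.imag * imag
+ // imag' = alpha.real * imag + alpha.imag * real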
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmul.vf v28, v22, ft1");
+ __asm__("vfmul.vf v30, v20, ft1");
+
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ __asm__("vfmsub.vf v20, ft0, v28");
+ __asm__("vfmadd.vf v22, ft0, v30");
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
+ if (csc == 2 * FLT_SIZE) {
+ scomplex *c_tmp = c;
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ else {
+ scomplex *c_tmp = c;
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_cgemm_6m2_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const scomplex* restrict alpha,
+ const scomplex* restrict a,
+ const scomplex* restrict b,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ case 4:
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+ case 3:
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ case 2:
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+ case 1:
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ }
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
+ if (csc == 2 * FLT_SIZE) {
+ scomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ else {
+ scomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ switch (M) {
+ case 5:
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 5:
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+
+ return;
+}
+
+void bli_cgemm_6m2_k0
+ (
+ dim_t M,
+ dim_t N,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
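+    // (With K == 0 the alpha*a*b contribution is an empty sum, so alpha is never read
+    // here; the reference kernel still forms alpha*0, which would propagate an
+    // infinite or NaN alpha into c.)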
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ csc *= 2 * FLT_SIZE;
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real == 0.f && beta_cast.imag == 0.f) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ c += rsc;
+ }
+ }
+ else {
+ // scale c by beta
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ c += rsc;
+ }
+ }
+ return;
+}
+
+void bli_cgemm_sifive_x280_asm_6m2
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ // M x N x K cgemm
+ (void) data;
+ (void) cntx;
+ const scomplex* restrict alpha = alpha_;
+ const scomplex* restrict beta = beta_;
+ const scomplex* restrict a = a_;
+ const scomplex* restrict b = b_;
+ scomplex* restrict c = c_;
+
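+    // Dispatch on the microtile shape: K == 0 reduces to a beta-update of c, a full
+    // 6-row tile uses the main kernel, and partial tiles use the cleanup kernel.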
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc);
+ else if (M == 6)
+ bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of underlying floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define PACKMR 8
+#define PACKNR 16
+
+void bli_zgemm_6m2
+ (
+ dim_t N,
+ dim_t K,
+ const dcomplex* restrict alpha,
+ const dcomplex* restrict a,
+ const dcomplex* restrict b,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 6 x N x K zgemm, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
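+    // Rows of b are double-buffered in v24/v26 and v28/v30 so the next row can be
+    // loaded while the current one is consumed; the six rows of c accumulate in
+    // v0/v2 through v20/v22.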
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmul_vf(v20, v22, v24, v26, ft10, ft11);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
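+    // Scale the accumulated a*b by alpha: for each row, re' = re*alpha.r - im*alpha.i
+    // and im' = im*alpha.r + re*alpha.i (vfmul forms the alpha.i products, then
+    // vfmsub/vfmadd fold in the alpha.r terms).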
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmul.vf v28, v22, ft1");
+ __asm__("vfmul.vf v30, v20, ft1");
+
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ __asm__("vfmsub.vf v20, ft0, v28");
+ __asm__("vfmadd.vf v22, ft0, v30");
+
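+    // If beta != 0, load c and accumulate beta*c into the alpha-scaled products;
+    // unit column stride (csc == 2*FLT_SIZE) uses contiguous segment loads,
+    // otherwise strided segment loads.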
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real != 0. || beta_cast.imag != 0.) {
+ if (csc == 2 * FLT_SIZE) {
+ dcomplex *c_tmp = c;
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ else {
+ dcomplex *c_tmp = c;
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ }
+
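+    // Write the six updated rows back to c, again choosing contiguous or strided
+    // segment stores based on csc.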
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_zgemm_6m2_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const dcomplex* restrict alpha,
+ const dcomplex* restrict a,
+ const dcomplex* restrict b,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
+
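+    // The switch statements below fall through intentionally so that only the
+    // first M rows of a and c are touched.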
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ case 4:
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+ case 3:
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ case 2:
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+ case 1:
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ }
+
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real != 0. || beta_cast.imag != 0.) {
+ if (csc == 2 * FLT_SIZE) {
+ dcomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ else {
+ dcomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ switch (M) {
+ case 5:
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 5:
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+
+ return;
+}
+
+void bli_zgemm_6m2_k0
+ (
+ dim_t M,
+ dim_t N,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ csc *= 2 * FLT_SIZE;
+
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real == 0. && beta_cast.imag == 0.) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ c += rsc;
+ }
+ }
+ else {
+ // scale c by beta
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ c += rsc;
+ }
+ }
+ return;
+}
+
+void bli_zgemm_sifive_x280_asm_6m2
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ // M x N x K zgemm
+ (void) data;
+ (void) cntx;
+ const dcomplex* restrict alpha = alpha_;
+ const dcomplex* restrict beta = beta_;
+ const dcomplex* restrict a = a_;
+ const dcomplex* restrict b = b_;
+ dcomplex* restrict c = c_;
+
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc);
+ else if (M == 6)
+ bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
new file mode 100644
index 0000000000..18df010d05
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
@@ -0,0 +1,327 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{
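+    // Computes b11 := inv(a11) * (alpha * b11 - a10 * b01) for lower-triangular a11,
+    // writing the result to both b11 and c11.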
+ (void) data;
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a10 = a10_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b01 = b01_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (m <= 0 || n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+
+ DATATYPE alpha_cast = *alpha;
+ if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
+ switch (m) {
+ case 6:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ case 5:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ case 4:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v14, 0");
+ case 3:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v10, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ case 1:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ }
+ }
+ else {
+ const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR;
+ switch (m) {
+ case 6:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 5:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 3:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 1:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+ }
+ }
+
+ if (k >= 1) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ while (k > 0) {
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ k -= 1;
+
+ if (k == 0) { break; }
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ k -= 1;
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
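+    // TRSM phase: the packed a11 is assumed to carry an already-inverted diagonal,
+    // so each row of b11 is solved with a complex multiply, streamed out to b11 and
+    // c11, and then eliminated (vcnmsac_vf) from the rows below.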
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 1) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+ vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 2) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 3) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 4) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 5) return;
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ return;
+}
+
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
new file mode 100644
index 0000000000..a0f9134731
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
@@ -0,0 +1,253 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a10 = a10_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b01 = b01_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
+ return;
+
+ dim_t b11_offset, temp;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
+
+ // Multiply step sizes by data size
+ __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+
+ __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1));
+ __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE));
+ __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp));
+ // b11_offset = (m-1)*PACKNR*FLT_SIZE
+
+ __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset));
+ __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); // TO DO: optimize alpha = 1 case
+    switch (m){ // Vector loads from b11 using a Duff's-device-style fall-through, multiplying by alpha
+ case 7: __asm__(VLE " v0, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 6: __asm__(VLE " v4, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v4, v4, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 5: __asm__(VLE " v8, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v8, v8, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v12, v12, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v16, v16, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v20, v20, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v24, v24, f0");
+ // no sub of b11 on final entry
+ }
+ // b11 now reset to original value
+ // v0 = row 6 of b11
+ // v4 = row 5 of b11
+ // v8 = row 4 of b11
+ // v12 = row 3 of b11
+ // v16 = row 2 of b11
+ // v20 = row 1 of b11
+ // v24 = row 0 of b11
+
+ // GEMM: B11 := alpha * B11 - A10 * B01
+ for (dim_t i = 0; i < k; i++){
+        __asm__(VLE " v28, (%0)": : "r"(b01)); // row i of b01
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v0, f6, v28");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v4, f5, v28");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v8, f4, v28");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v12, f3, v28");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v16, f2, v28");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v20, f1, v28");
+ case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v24, f0, v28");
+ }
+ __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE));
+ }
+ // TRSM: B11 := inv(A11) * B11
+ // TO DO: Investigate code size reduction (loop rerolling)
+
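+    // The packed A11 is assumed to store its diagonal pre-inverted, so each row
+    // solve below is a vfmul rather than a division.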
+ // Row 0
+ __asm__(FLT_LOAD " f0, %0(%1)": : "I"(0*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v24, v24, f0");
+ __asm__(VSE " v24, (%0)": : "r"(b11));
+ __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 1) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v24");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v24");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v24");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v24");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v24");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v20, f1, v24");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 1
+ __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v20, v20, f1");
+ __asm__(VSE " v20, (%0)": : "r"(b11));
+ __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 2) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v20");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v20");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v20");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v20");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v20");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 2
+ __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v16, v16, f2");
+ __asm__(VSE " v16, (%0)": : "r"(b11));
+ __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 3) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v16");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v16");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v16");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v16");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 3
+ __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v12, v12, f3");
+ __asm__(VSE " v12, (%0)": : "r"(b11));
+ __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 4) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v12");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v12");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v12");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 4
+ __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v8, v8, f4");
+ __asm__(VSE " v8, (%0)": : "r"(b11));
+ __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 5) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v8");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v8");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 5
+ __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v4, v4, f5");
+ __asm__(VSE " v4, (%0)": : "r"(b11));
+ __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 6) return;
+
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v4");
+
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 6
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v0, v0, f6");
+ __asm__(VSE " v0, (%0)": : "r"(b11));
+ __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+}
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
new file mode 100644
index 0000000000..4323f8fbf6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
@@ -0,0 +1,182 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../../riscv_cmul_macros_asm.h"
+#include <stdint.h>
+#include <stddef.h>
+
+#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\
+ dim_t m, \
+ dim_t n, \
+ dim_t k, \
+ const T* restrict alpha_, \
+ const T* restrict a10_, \
+ const T* restrict a11_, \
+ const T* restrict b01_, \
+ T* restrict b11_, \
+ T* restrict c11_, \
+ inc_t rsc, \
+ inc_t csc, \
+ auxinfo_t* restrict data, \
+ const cntx_t* restrict cntx \
+ )
+
+#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\
+ dim_t m, \
+ dim_t n, \
+ dim_t k, \
+ const T* restrict alpha_, \
+ const T* restrict a12_, \
+ const T* restrict a11_, \
+ const T* restrict b21_, \
+ T* restrict b11_, \
+ T* restrict c11_, \
+ inc_t rsc, \
+ inc_t csc, \
+ auxinfo_t* restrict data, \
+ const cntx_t* restrict cntx \
+ )
+
+#define GEMMTRSM(macro, ...) macro(__VA_ARGS__)
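+// Each precision below sets its element type, PACKMR/PACKNR, and load/store
+// mnemonics, then re-includes the shared kernel bodies so that the same source
+// expands into the s, d, c, and z microkernels.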
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PACKMR 8
+#define PACKNR 64
+#define VLE "vle32.v"
+#define VSE "vse32.v"
+#define VSSE "vsse32.v"
+#define FLT_LOAD "flw"
+#define FLT_SIZE sizeof(float)
+#define LOG_FLT_SIZE 2
+
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PACKMR 8
+#define PACKNR 32
+#define VLE "vle64.v"
+#define VSE "vse64.v"
+#define VSSE "vsse64.v"
+#define FLT_LOAD "fld"
+#define FLT_SIZE sizeof(double)
+#define LOG_FLT_SIZE 3
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define PRECISION_CHAR c
+#define PACKMR 8
+#define PACKNR 32
+#define VLSEG2 "vlseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define FLT_LOAD "flw "
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define PRECISION_CHAR z
+#define PACKMR 8
+#define PACKNR 16
+#define VLSEG2 "vlseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define FLT_LOAD "fld "
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+
+
+#undef GEMMTRSM
+#undef GEMMTRSM_L
+#undef GEMMTRSM_U
+
+
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
new file mode 100644
index 0000000000..9332fd0963
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
@@ -0,0 +1,331 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{
+ (void) data;
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a12 = a12_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b21 = b21_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (m <= 0 || n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+
+ DATATYPE alpha_cast = *alpha;
+ if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
+ switch (m) {
+ case 6:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ case 5:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ case 4:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v14, 0");
+ case 3:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v10, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ case 1:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ }
+ }
+ else {
+ const DATATYPE* b11_tmp = b11;
+ switch (m) {
+ case 6:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 5:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 3:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 1:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+ }
+ }
+
+ if (k >= 1) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
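+    // Point a12 at its last entry so earlier rows are reached with negative offsets;
+    // v0/v2 hold row m-1 of b11, which the upper-triangular solve handles first.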
+ a12 += m - 1;
+
+ while (k > 0) {
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ k -= 1;
+
+ if (k == 0) { break; }
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ k -= 1;
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR
+ b11 += (m - 1) * PACKNR;
+ c11 += (m - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 1) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 2) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 3) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 4) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 5) return;
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ return;
+}
+#endif
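
For reference, the fragment above belongs to the complex gemmtrsm_u microkernel (the counterpart of the real kernel added next), which per BLIS's gemmtrsm definition computes B11 := inv(A11) * (alpha*B11 - A12*B21). The diagonal of A11 is assumed pre-inverted during packing, so each per-row "division" is just a complex multiply (vcmul_vf against the diagonal element), and every solved row is streamed out to both the packed b11 and to c11. The plain-C model below restates that computation with ordinary row-major arrays; the cplx typedef, leading dimensions, and function name are illustrative assumptions, not the kernel's packed layouts.

// Plain-C model of the operation computed by the kernel above (illustrative).
// Assumes a11's diagonal was pre-inverted by the packing routine, as BLIS
// does for trsm; arrays are ordinary row-major, not BLIS packed panels.
#include <complex.h>
#include <stddef.h>

typedef double complex cplx;

static void gemmtrsm_u_model(size_t m, size_t n, size_t k, cplx alpha,
                             const cplx* a12,   // m x k, right of the diagonal block
                             const cplx* a11,   // m x m, upper triangular, inverted diagonal
                             const cplx* b21,   // k x n
                             cplx* b11)         // m x n, overwritten with the solution
{
    // B11 := alpha*B11 - A12*B21   (the k-loop built from vcmul_vf2/vcnmsac_vf)
    for (size_t i = 0; i < m; i++)
        for (size_t j = 0; j < n; j++)
        {
            cplx t = alpha * b11[i*n + j];
            for (size_t p = 0; p < k; p++)
                t -= a12[i*k + p] * b21[p*n + j];
            b11[i*n + j] = t;
        }

    // B11 := inv(A11)*B11, bottom row first (the per-row vcmul_vf/vcnmsac_vf chain)
    for (size_t i = m; i-- > 0; )
    {
        for (size_t j = 0; j < n; j++)
            b11[i*n + j] *= a11[i*m + i];        // diagonal is already inverted
        for (size_t p = 0; p < i; p++)           // eliminate row i from the rows above
            for (size_t j = 0; j < n; j++)
                b11[p*n + j] -= a11[p*m + i] * b11[i*n + j];
    }
}
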
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
new file mode 100644
index 0000000000..2d511a8ba6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
@@ -0,0 +1,260 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a12 = a12_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b21 = b21_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
+ return;
+
+ dim_t m_sz, a11_offset, c11_offset, temp;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
+
+ // Multiply step sizes by data size
+ __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE));
+
+ __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE));
+ __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp));
+ __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE));
+ __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc));
+ __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc));
+ // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE
+ // c11_offset = rsc*(m-1)*sz
+
+ __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));
+ switch (m){ // Vector loads from b11 with a Duff's device, multiplying by alpha
+ case 7: __asm__(VLE " v0, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 6: __asm__(VLE " v4, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v4, v4, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 5: __asm__(VLE " v8, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v8, v8, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v12, v12, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v16, v16, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v20, v20, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v24, v24, f0");
+ // no add of b11 on final entry
+ }
+ // b11 now positioned at start of last row
+ // v24 = row 0 from bottom (bottom row)
+ // v20 = row 1 from bottom
+ // v16 = row 2 from bottom
+ // v12 = row 3 from bottom
+ // v8 = row 4 from bottom
+ // v4 = row 5 from bottom
+ // v0 = row 6 from bottom
+
+ // GEMM: B11 := alpha * B11 - A12 * B21
+ __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz));
+ for (dim_t i = 0; i < k; i++){
+ __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v0, f6, v28");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v4, f5, v28");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v8, f4, v28");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v12, f3, v28");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v16, f2, v28");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v20, f1, v28");
+ case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v24, f0, v28");
+ }
+ __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE));
+ }
+ // TRSM: B11 := inv(A11) * B11
+ // Move a11 to end of array and c11 to first entry in last row
+ __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset));
+
+ // Row 0 from bottom (bottom row)
+ __asm__(FLT_LOAD " f0, %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v24, v24, f0");
+ __asm__(VSE " v24, (%0)": : "r"(b11));
+ __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 1) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v24");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v24");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v24");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v24");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v24");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v20, f1, v24");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 1 from bottom
+ __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v20, v20, f1");
+ __asm__(VSE " v20, (%0)": : "r"(b11));
+ __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 2) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v20");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v20");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v20");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v20");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v20");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 2 from bottom
+ __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v16, v16, f2");
+ __asm__(VSE " v16, (%0)": : "r"(b11));
+ __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 3) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v16");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v16");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v16");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v16");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 3 from bottom
+ __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v12, v12, f3");
+ __asm__(VSE " v12, (%0)": : "r"(b11));
+ __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 4) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v12");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v12");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v12");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 4 from bottom
+ __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v8, v8, f4");
+ __asm__(VSE " v8, (%0)": : "r"(b11));
+ __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 5) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v8");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v8");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 5 from bottom
+ __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v4, v4, f5");
+ __asm__(VSE " v4, (%0)": : "r"(b11));
+ __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 6) return;
+
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v4");
+
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 6 from bottom
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v0, v0, f6");
+ __asm__(VSE " v0, (%0)": : "r"(b11));
+ __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+
+}
+#endif
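
The real upper-triangular kernel above mirrors the complex one with one vector register group per row of B11. The sketch below condenses its TRSM stage into scalar C to make the bottom-up row order and the dual store (VSE back into the packed b11, VSSE into c11) explicit. Here rsc/csc are element strides (the assembly pre-scales them to byte strides), and the column-stride-PACKMR indexing of a11 is inferred from the load offsets above, so treat the layout as an illustrative assumption.

// Scalar model of the back-substitution and store pattern above (real case,
// illustrative). a11's diagonal is assumed pre-inverted by packing; a11 is
// treated as packed with column stride packmr, b11 with row stride packnr.
#include <stddef.h>

typedef ptrdiff_t inc_t;

static void trsm_u_stage_model(size_t m, size_t n,
                               const double* a11, double* b11, double* c11,
                               inc_t rsc, inc_t csc,
                               size_t packmr, size_t packnr)
{
    for (size_t i = m; i-- > 0; )                 // bottom row first, as in the asm
    {
        double* bi = b11 + i*packnr;
        for (size_t j = 0; j < n; j++)
            bi[j] *= a11[i*packmr + i];           // vfmul.vf with the inverted diagonal
        for (size_t j = 0; j < n; j++)            // VSE keeps b11 updated in place;
            c11[i*rsc + j*csc] = bi[j];           // VSSE writes the same row to c11
        for (size_t p = 0; p < i; p++)            // vfnmsac.vf: eliminate from rows above
            for (size_t j = 0; j < n; j++)
                b11[p*packnr + j] -= a11[i*packmr + p] * bi[j];
    }
}
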
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
new file mode 100644
index 0000000000..425c7dad92
--- /dev/null
+++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h
@@ -0,0 +1,160 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Level 1
+ADDV_KER_PROT(float, s, addv_sifive_x280_intr)
+ADDV_KER_PROT(double, d, addv_sifive_x280_intr)
+ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr)
+ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr)
+
+AMAXV_KER_PROT(float, s, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(double, d, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_asm)
+
+AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(scomplex, c, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(dcomplex, z, axpbyv_sifive_x280_intr)
+
+AXPYV_KER_PROT(float, s, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_x280_intr)
+
+COPYV_KER_PROT(float, s, copyv_sifive_x280_asm)
+COPYV_KER_PROT(double, d, copyv_sifive_x280_asm)
+COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_asm)
+COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_asm)
+
+DOTV_KER_PROT(float, s, dotv_sifive_x280_intr)
+DOTV_KER_PROT(double, d, dotv_sifive_x280_intr)
+DOTV_KER_PROT(scomplex, c, dotv_sifive_x280_intr)
+DOTV_KER_PROT(dcomplex, z, dotv_sifive_x280_intr)
+
+DOTXV_KER_PROT(float, s, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr)
+
+INVERTV_KER_PROT(float, s, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(double, d, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_asm)
+
+INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm)
+
+SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(scomplex, c, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(dcomplex, z, scal2v_sifive_x280_intr)
+
+SCALV_KER_PROT(float, s, scalv_sifive_x280_intr)
+SCALV_KER_PROT(double, d, scalv_sifive_x280_intr)
+SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr)
+SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr)
+
+SETV_KER_PROT(float, s, setv_sifive_x280_asm)
+SETV_KER_PROT(double, d, setv_sifive_x280_asm)
+SETV_KER_PROT(scomplex, c, setv_sifive_x280_asm)
+SETV_KER_PROT(dcomplex, z, setv_sifive_x280_asm)
+
+SUBV_KER_PROT(float, s, subv_sifive_x280_intr)
+SUBV_KER_PROT(double, d, subv_sifive_x280_intr)
+SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr)
+SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr)
+
+SWAPV_KER_PROT(float, s, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(double, d, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_asm)
+
+XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(double, d, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(scomplex, c, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(dcomplex, z, xpbyv_sifive_x280_intr)
+
+// Level 1f
+AXPY2V_KER_PROT(float, s, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr)
+
+AXPYF_KER_PROT(float, s, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(double, d, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_asm)
+
+DOTXF_KER_PROT(float, s, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(double, d, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_asm)
+
+DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr)
+
+DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(scomplex, c, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(dcomplex, z, dotxaxpyf_sifive_x280_asm)
+
+// Level 1m
+PACKM_KER_PROT(float, s, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(double, d, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(float, s, packm_sifive_x280_asm_64xk)
+PACKM_KER_PROT(double, d, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_16xk)
+
+// Level 3
+GEMM_UKR_PROT(float, s, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(double, d, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_asm_6m2)
+GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_asm_6m2)
+
+GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm)
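
These macros only declare the kernel symbols; the definitions live in the files added above, and the sifive_x280 configuration registers them by name. Each *_KER_PROT(ctype, ch, name) line pastes the type character onto the kernel name, so the Level-1 block declares bli_saddv_sifive_x280_intr, bli_daddv_sifive_x280_intr, bli_caddv_sifive_x280_intr, and bli_zaddv_sifive_x280_intr, while the Level-3 block declares bli_sgemm_sifive_x280_asm_7m4 and so on. Roughly, one such prototype has the shape sketched below (parameter qualifiers elided; the authoritative expansion is BLIS's own kernel-prototype macros):

// Illustrative shape of the prototype generated by
// ADDV_KER_PROT(float, s, addv_sifive_x280_intr); qualifiers elided.
#include "blis.h"

void bli_saddv_sifive_x280_intr
     (
       conj_t  conjx,
       dim_t   n,
       float*  x, inc_t incx,
       float*  y, inc_t incy,
       cntx_t* cntx
     );
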
diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h
new file mode 100644
index 0000000000..9c33fd7bc5
--- /dev/null
+++ b/kernels/sifive_x280/riscv_cmul_macros_asm.h
@@ -0,0 +1,137 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// macros to emit complex multiplication
+// caveat: the destination registers cannot overlap the source registers!
+// rd = rs1 * rs2
+#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \
+ \
+ __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\
+ __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\
+ __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\
+ __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i)
+
+// vd = vs2 * f[rs1]
+#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd = conj(vs2) * f[rs1]
+#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += vs2 * f[rs1]
+#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += conj(vs2) * f[rs1]
+#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd -= vs2 * f[rs1]
+#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd = vs2 * vs1
+#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd = vs2 * conj(vs1)
+#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * vs1
+#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * conj(vs1)
+#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
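
Written out as ordinary arithmetic, these macro families compute the following: the plain forms take register names as tokens, the *2 variants (where present) take C floating-point values bound through the "f" constraint, and the *_conj forms conjugate one of the two source operands (vs2 in the _vf forms, vs1 in the _vv forms). The no-overlap caveat exists because the first two multiplies write the destination pair before all source operands have been read. A scalar model of the emitted arithmetic, with an illustrative struct standing in for the split real/imaginary register pairs:

// Scalar model of the complex arithmetic the macros above emit (illustrative;
// the macros operate on split real/imaginary vector registers, not a struct).
typedef struct { double r, i; } cplx;

static inline cplx cmul_model(cplx a, cplx b)        // cmul / vcmul_vf / vcmul_vv: a * b
{ return (cplx){ a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r }; }

static inline cplx cmul_conj_model(cplx a, cplx b)   // vcmul_vf_conj: conj(a) * b
{ return (cplx){ a.r*b.r + a.i*b.i, a.r*b.i - a.i*b.r }; }

static inline cplx cmacc_model(cplx d, cplx f, cplx v)   // vcmacc_vf: d += v * f
{ d.r += v.r*f.r - v.i*f.i; d.i += v.r*f.i + v.i*f.r; return d; }

static inline cplx cnmsac_model(cplx d, cplx f, cplx v)  // vcnmsac_vf: d -= v * f
{ d.r -= v.r*f.r - v.i*f.i; d.i -= v.r*f.i + v.i*f.r; return d; }
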
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
new file mode 100644
index 0000000000..6a1d11b131
--- /dev/null
+++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
@@ -0,0 +1,116 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// 6. Configuration-Setting and Utility Functions
+#define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t
+#define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL)
+#define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t
+#define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS)
+#define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL
+#define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL)
+
+// 7. Vector Loads and Stores
+// Loads
+#define VLE_V_F_(PRECISION, LMUL) __riscv_vle##PRECISION##_v_f##PRECISION##LMUL
+#define VLE_V_F(PRECISION, LMUL) VLE_V_F_(PRECISION, LMUL)
+#define VLSE_V_F_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL
+#define VLSE_V_F(PRECISION, LMUL) VLSE_V_F_(PRECISION, LMUL)
+#define VLSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+// Stores
+#define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL
+#define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL)
+#define VSSE_V_F_(PRECISION, LMUL) __riscv_vsse##PRECISION##_v_f##PRECISION##LMUL
+#define VSSE_V_F(PRECISION, LMUL) VSSE_V_F_(PRECISION, LMUL)
+#define VSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+
+// 13. Vector Floating-Point Operations
+#define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL
+#define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL)
+#define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL
+#define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL
+#define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
+#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
+#define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL
+#define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL)
+#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
+#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL
+#define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL
+#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
+#define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL
+#define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL)
+#define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL
+#define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL)
+#define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL
+#define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL)
+#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG
+#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL)
+
+// 14. Vector Reduction Operations
+#define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1
+#define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL)
+
+// 16. Vector Permutation Operations
+#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
+#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION
+#define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION)
+
+// Miscellaneous Vector Functions
+#define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL
+#define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL)
+#define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL
+#define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL)
+#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
+#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS
+#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS)
+
+// Non-vector functions
+#define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
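
These wrappers are pure token pasting onto the RVV C intrinsics naming scheme (the __riscv_-prefixed v0.12-style names) and the corresponding vector and tuple types: VSETVL(32, m2) expands to __riscv_vsetvl_e32m2, RVV_TYPE_FX(32, m2, 2) to vfloat32m2x2_t, VLSEG2_V_F(32, m2, 2) to __riscv_vlseg2e32_v_f32m2x2, and so on. A minimal sketch of how the *_intr kernels call them, shown here as a standalone axpy-style loop rather than an actual BLIS kernel (assumes this header plus <riscv_vector.h> and a compiler targeting RVV 1.0):

#include <stddef.h>
#include <riscv_vector.h>
// #include "riscv_overloaded_intrinsics.h"   // the wrappers defined above

// Illustrative only: strip-mined y += alpha*x using the e32/m2 instantiation.
static void saxpy_sketch(size_t n, float alpha, const float* restrict x,
                         float* restrict y)
{
    for (size_t avl = n; avl > 0; )
    {
        size_t vl = VSETVL(32, m2)(avl);                 // __riscv_vsetvl_e32m2
        RVV_TYPE_F(32, m2) xv = VLE_V_F(32, m2)(x, vl);  // __riscv_vle32_v_f32m2
        RVV_TYPE_F(32, m2) yv = VLE_V_F(32, m2)(y, vl);
        yv = VFMACC_VF(32, m2)(yv, alpha, xv, vl);       // y += alpha * x
        VSE_V_F(32, m2)(y, yv, vl);                      // __riscv_vse32_v_f32m2
        x += vl; y += vl; avl -= vl;
    }
}
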
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 13bb8ea4c7..56750edf57 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -135,15 +135,16 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops )
err_t r_val;
#ifdef BLIS_ENABLE_HPX
+ size_t nt = ( size_t )params->n_app_threads;
- size_t tdata_size = ( size_t )params->n_app_threads *
+ size_t tdata_size = ( size_t )nt *
( size_t )sizeof( thread_data_t );
thread_data_t* tdata = bli_malloc_user( tdata_size, &r_val );
tdata->params = params;
tdata->ops = ops;
tdata->nt = nt;
- tdata->id = 1;
+ tdata->id = 0;
tdata->xc = 0;
// Walk through all test modules.
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
index a51d33061a..56c2b85c26 100755
--- a/travis/do_riscv.sh
+++ b/travis/do_riscv.sh
@@ -3,16 +3,19 @@
set -e
set -x
-TAG=2023.02.25
+TAG=2023.10.18
# The prebuilt toolchains only support hardfloat, so we only
# test these for now.
case $1 in
"rv32iv")
- TARBALL=riscv32-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+ TARBALL=riscv32-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
;;
"rv64iv")
- TARBALL=riscv64-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+ TARBALL=riscv64-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
+ ;;
+ "sifive_x280")
+ TARBALL=riscv64-glibc-ubuntu-20.04-llvm-nightly-${TAG}-nightly.tar.gz
;;
*)
exit 1