diff --git a/.travis.yml b/.travis.yml
index 6cb75cf877..d2a1fb842e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -86,6 +86,11 @@ matrix:
env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \
CC=riscv32-unknown-linux-gnu-gcc \
LDFLAGS=-static
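+ # SiFive X280: cross-compile with clang and run the fast testsuite under qemu-riscv64 (VLEN=512).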
+ - os: linux
+ compiler: clang
+ env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="sifive_x280" \
+ CC=clang \
+ LDFLAGS=-static
install:
- if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi
- if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi
@@ -106,6 +111,12 @@ script:
export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++;
export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000";
fi
+- if [ "$CONF" = "sifive_x280" ]; then
+ $DIST_PATH/travis/do_riscv.sh "$CONF";
+ export CC=$DIST_PATH/../toolchain/riscv/bin/clang;
+ export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++;
+ export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000";
+ fi
- $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF
- pwd
- ls -l
diff --git a/CREDITS b/CREDITS
index b81ca66526..03953b1a19 100644
--- a/CREDITS
+++ b/CREDITS
@@ -17,11 +17,13 @@ but many others have contributed code, ideas, and feedback, including
Alex Arslan @ararslan
Vernon Austel (IBM, T.J. Watson Research Center)
Mohsen Aznaveh @Aznaveh (Texas A&M University)
+ Abhishek Bagusetty @abagusetty (Argonne National Laboratory)
Satish Balay @balay (Argonne National Laboratory)
Kihiro Bando @bandokihiro
Matthew Brett @matthew-brett (University of Birmingham)
Jérémie du Boisberranger @jeremiedbb
Jed Brown @jedbrown (Argonne National Laboratory)
+ Alex Chiang @alexsifivetw (SiFive)
Robin Christ @robinchrist
Dilyn Corner @dilyn-corner
Mat Cross @matcross (NAG)
@@ -37,12 +39,14 @@ but many others have contributed code, ideas, and feedback, including
Victor Eijkhout @VictorEijkhout (Texas Advanced Computing Center)
Evgeny Epifanovsky @epifanovsky (Q-Chem)
Isuru Fernando @isuruf
+ James Foster @jd-foster (CSIRO)
Roman Gareev @gareevroman
Richard Goldschmidt @SuperFluffy
Chris Goodyer
Alexander Grund @Flamefire
John Gunnels @jagunnels (IBM, T.J. Watson Research Center)
Ali Emre Gülcü @Lephar
+ @h-vetinari
Jeff Hammond @jeffhammond (Intel)
Jacob Gorm Hansen @jacobgorm
Shivaprashanth H (Global Edge)
@@ -52,7 +56,9 @@ but many others have contributed code, ideas, and feedback, including
Minh Quan Ho @hominhquan
Matthew Honnibal @honnibal
Stefan Husmann @stefanhusmann
+ Aaron Hutchinson @Aaron-Hutchinson (SiFive)
Francisco Igual @figual (Universidad Complutense de Madrid)
+ John Mather @jmather-sesi (SideFX Software)
Madeesh Kannan @shadeMe
Tony Kelman @tkelman
Lee Killough @leekillough (Tactical Computing Labs)
@@ -125,12 +131,12 @@ but many others have contributed code, ideas, and feedback, including
Meghana Vankadari @Meghana-vankadari (AMD)
Kiran Varaganti @kvaragan (AMD)
Natalia Vassilieva (Hewlett Packard Enterprise)
- @h-vetinari
Andrew Wildman @awild82 (University of Washington)
Zhang Xianyi @xianyi (Chinese Academy of Sciences)
Benda Xu @heroxbd
Guodong Xu @docularxu (Linaro.org)
RuQing Xu @xrq-phys (The University of Tokyo)
+ Srinivas Yadav @srinivasyadav18
Costas Yamin @cosstas
Chenhan Yu @ChenhanYu (The University of Texas at Austin)
Roman Yurchak @rth (Symerio)
diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c
index d3871d8f77..a61d1b95d4 100644
--- a/config/bgq/bli_cntx_init_bgq.c
+++ b/config/bgq/bli_cntx_init_bgq.c
@@ -69,11 +69,11 @@ void bli_cntx_init_bgq( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 1024, 0, 768 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 2048, 0, 1536 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 10240, 0, 10240 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 1024, -1, 768 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 2048, -1, 1536 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 10240, -1, 10240 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c
index 6af3ff91ce..55a8000e74 100644
--- a/config/cortexa9/bli_cntx_init_cortexa9.c
+++ b/config/cortexa9/bli_cntx_init_cortexa9.c
@@ -69,11 +69,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c
index 8f615588c6..bbaf37541b 100644
--- a/config/knc/bli_cntx_init_knc.c
+++ b/config/knc/bli_cntx_init_knc.c
@@ -67,13 +67,13 @@ void bli_cntx_init_knc( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 30, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0,
- 0, 160, 0, 0 );
- bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0,
- 0, 300, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 30, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1,
+ -1, 160, -1, -1 );
+ bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 240, -1, -1,
+ -1, 300, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c
index 964438e834..30b3ac9fa4 100644
--- a/config/penryn/bli_cntx_init_penryn.c
+++ b/config/penryn/bli_cntx_init_penryn.c
@@ -77,11 +77,11 @@ void bli_cntx_init_penryn( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c
index d5ffe7dcfa..9d1de3da5c 100644
--- a/config/power7/bli_cntx_init_power7.c
+++ b/config/power7/bli_cntx_init_power7.c
@@ -67,11 +67,11 @@ void bli_cntx_init_power7( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 8, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 64, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c
new file mode 100644
index 0000000000..197394c822
--- /dev/null
+++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c
@@ -0,0 +1,226 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+void bli_cntx_init_sifive_x280( cntx_t* cntx )
+{
+ blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+ // Set default kernel blocksizes and functions.
+ bli_cntx_init_sifive_x280_ref( cntx );
+
+ // -------------------------------------------------------------------------
+
+ // Update the context with optimized native kernels.
+ bli_cntx_set_ukrs
+ (
+ cntx,
+
+ // Level 1
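+ // Kernels with an _intr suffix are (presumably) implemented with RVV
+ // intrinsics, while _asm kernels are hand-written assembly.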
+ BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr,
+ BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr,
+
+ BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm,
+ BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm,
+
+ BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr,
+ BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr,
+
+ BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr,
+ BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr,
+
+ BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm,
+ BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm,
+
+ BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr,
+ BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr,
+
+ BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr,
+ BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr,
+
+ BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm,
+ BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm,
+
+ BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm,
+ BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm,
+
+ BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr,
+ BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr,
+
+ BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_SCOMPLEX, bli_cscalv_sifive_x280_intr,
+ BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr,
+
+ BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm,
+ BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm,
+
+ BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr,
+ BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr,
+
+ BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm,
+ BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm,
+
+ BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr,
+ BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr,
+
+ // Level 1f
+ BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr,
+ BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr,
+
+ BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm,
+ BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm,
+
+ BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm,
+ BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm,
+
+ BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr,
+ BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr,
+
+ BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm,
+ BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm,
+
+ // Level 1m
+ BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_7xk,
+ BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_7xk,
+ BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6xk,
+ BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6xk,
+ BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_64xk,
+ BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_32xk,
+ BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_32xk,
+ BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_16xk,
+
+ // Level 3
+ BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4,
+ BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_asm_7m4,
+ BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2,
+ BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2,
+
+ BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm,
+ BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm,
+
+ BLIS_VA_END
+ );
+
+ // Update the context with storage preferences.
+ bli_cntx_set_ukr_prefs
+ (
+ cntx,
+
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE,
+ BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE,
+
+ BLIS_VA_END
+ );
+
+ // Initialize level-3 blocksize objects with architecture-specific values.
+ // s d c z
+ bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6,
+ 8, 8, 8, 8 );
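+ // The second row of BLIS_MR values above sets the maximum (packing)
+ // register blocksizes, which must agree with BLIS_PACKMR_? in
+ // bli_kernel_defs_sifive_x280.h.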
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 32, 32, 16 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], 28, 28, 24, 24 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], 1024, 1024, 1024, 1024 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 128, 256, 128 );
+ // BLIS_BBM and BLIS_BBN default to 1, but they are set here explicitly to ensure the correct values are used.
+ bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 );
+ bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 );
+
+ // Update the context with the current architecture's register and cache
+ // blocksizes (and multiples) for native execution.
+ bli_cntx_set_blkszs
+ (
+ cntx,
+
+ // level-3
+ BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+ BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+ BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+ BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+ BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+
+ // level-1m
+ BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM,
+ BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN,
+
+ BLIS_VA_END
+ );
+}
+
diff --git a/config/sifive_x280/bli_family_sifive_x280.h b/config/sifive_x280/bli_family_sifive_x280.h
new file mode 100644
index 0000000000..4f02c048fa
--- /dev/null
+++ b/config/sifive_x280/bli_family_sifive_x280.h
@@ -0,0 +1,34 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
diff --git a/config/sifive_x280/bli_kernel_defs_sifive_x280.h b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
new file mode 100644
index 0000000000..bb6865a669
--- /dev/null
+++ b/config/sifive_x280/bli_kernel_defs_sifive_x280.h
@@ -0,0 +1,55 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#ifndef BLIS_KERNEL_DEFS_H
+//#define BLIS_KERNEL_DEFS_H
+
+
+// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ----------------------------
+#define BLIS_MR_s 7
+#define BLIS_MR_d 7
+#define BLIS_MR_c 6
+#define BLIS_MR_z 6
+
+#define BLIS_PACKMR_s 8
+#define BLIS_PACKMR_d 8
+#define BLIS_PACKMR_c 8
+#define BLIS_PACKMR_z 8
+
+#define BLIS_NR_s 64
+#define BLIS_NR_d 32
+#define BLIS_NR_c 32
+#define BLIS_NR_z 16
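+
+// NOTE: the blocksizes above must be kept in sync with those registered
+// in bli_cntx_init_sifive_x280.c.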
+//#endif
+
diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk
new file mode 100644
index 0000000000..acdf5a3611
--- /dev/null
+++ b/config/sifive_x280/make_defs.mk
@@ -0,0 +1,78 @@
+#
+#
+# BLIS
+# An object-based framework for developing high-performance BLAS-like
+# libraries.
+#
+# Copyright (C) 2023, SiFive, Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# - Neither the name(s) of the copyright holder(s) nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#
+
+
+# Declare the name of the current configuration and add it to the
+# running list of configurations included by common.mk.
+THIS_CONFIG := sifive_x280
+#CONFIGS_INCL += $(THIS_CONFIG)
+
+#
+# --- Determine the C compiler and related flags ---
+#
+
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
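+# Note: rv64gcv_zba_zbb_zvl512b selects RV64GC plus the vector extension with
+# a minimum VLEN of 512 bits and the Zba/Zbb bit-manipulation extensions;
+# -mabi=lp64d and -mcmodel=medany select the 64-bit hard-float ABI and the
+# medium-any code model.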
+CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d
+CPPROCFLAGS :=
+CMISCFLAGS := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \
+ -fdiagnostics-color=always -fno-rtti -fno-exceptions
+CPICFLAGS := -fPIC
+CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \
+ -Wno-sign-compare -Wno-unused-variable
+
+ifneq ($(DEBUG_TYPE),off)
+CDBGFLAGS := -g
+endif
+
+ifeq ($(DEBUG_TYPE),noopt)
+COPTFLAGS := -O0
+else
+COPTFLAGS := -Ofast
+endif
+
+# Flags specific to optimized kernels.
+CKOPTFLAGS := $(COPTFLAGS)
+CKVECFLAGS :=
+
+# Flags specific to reference kernels.
+CROPTFLAGS := $(CKOPTFLAGS)
+CRVECFLAGS := $(CKVECFLAGS)
+
+# Store all of the variables here to new variables containing the
+# configuration name.
+$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c
index 4bacc5d63c..8e5a57d6cf 100644
--- a/config/template/bli_cntx_init_template.c
+++ b/config/template/bli_cntx_init_template.c
@@ -87,11 +87,11 @@ void bli_cntx_init_template( cntx_t* cntx )
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
- bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 0, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 0, 0, 4 );
- bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 0, 0, 128 );
- bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 0, 0, 256 );
- bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 0, 0, 4096 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, -1, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, -1, -1, 4 );
+ bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, -1, -1, 128 );
+ bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, -1, -1, 256 );
+ bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, -1, -1, 4096 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 88f39c3d13..0bd4ed3441 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -1,6 +1,6 @@
#
#
-# BLIS
+# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
@@ -35,7 +35,7 @@
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
-THIS_CONFIG := zen3
+THIS_CONFIG := zen3
#CONFIGS_INCL += $(THIS_CONFIG)
#
@@ -65,8 +65,8 @@ endif
# they make explicit use of the rbp register.
CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer
CROPTFLAGS := $(CKOPTFLAGS)
-CKVECFLAGS := -mavx2 -mfma -mfpmath=sse
-CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
+CKVECFLAGS := -mavx2 -mfma
+CRVECFLAGS := $(CKVECFLAGS)
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1.
CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
@@ -77,6 +77,8 @@ ifeq ($(CC_VENDOR),gcc)
CVECFLAGS_VER := -march=znver3
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0.
@@ -92,6 +94,8 @@ ifeq ($(CC_VENDOR),clang)
endif
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),aocc)
ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0.
@@ -103,8 +107,14 @@ ifeq ($(CC_VENDOR),aocc)
CVECFLAGS_VER := -march=znver3
endif
endif
+ CKVECFLAGS += -mfpmath=sse
+ CRVECFLAGS += -funsafe-math-optimizations -ffp-contract=fast
else
+ifeq ($(CC_VENDOR),nvc)
+ CVECFLAGS_VER := -march=znver3
+ CRVECFLAGS += -fast
+else
- $(error gcc, clang, or aocc is required for this configuration.)
+ $(error gcc, clang, nvc, or aocc is required for this configuration.)
+endif
endif
endif
endif
@@ -114,4 +124,3 @@ CRVECFLAGS += $(CVECFLAGS_VER)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
-
diff --git a/config_registry b/config_registry
index 61482ae7b8..09a33bc9a5 100644
--- a/config_registry
+++ b/config_registry
@@ -59,5 +59,8 @@ rv64i: rv64i/rvi
rv32iv: rv32iv/rviv
rv64iv: rv64iv/rviv
+# SiFive architectures.
+sifive_x280: sifive_x280
+
# Generic architectures.
generic: generic
diff --git a/docs/Multithreading.md b/docs/Multithreading.md
index 6f2ef49c55..d8f8b13f40 100644
--- a/docs/Multithreading.md
+++ b/docs/Multithreading.md
@@ -101,7 +101,7 @@ The `cores` value is most appropriate for BLIS since we usually want to ignore h
Setting these two variables is often enough. However, it obviously does not offer the level of control that `GOMP_CPU_AFFINITY` does. Sometimes, it takes some experimentation to determine whether a particular mapping is performing as expected. If multithreaded performance on eight cores is only twice what it is observed of single-threaded performance, the affinity mapping may be to blame. But if performance is six or seven times higher than sequential execution, then the mapping you chose is probably working fine.
-Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity/) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then choose your preferred method of expressing that mapping to the OpenMP implementation.
+Unfortunately, the topic of thread-to-core affinity is well beyond the scope of this document. (A web search will uncover many [great resources](https://web.archive.org/web/20190130102805/http://www.nersc.gov/users/software/programming-models/openmp/process-and-thread-affinity) discussing the use of [GOMP_CPU_AFFINITY](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fCPU_005fAFFINITY.html) and [OMP_PROC_BIND](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fPROC_005fBIND.html#OMP_005fPROC_005fBIND).) It's up to the user to determine an appropriate affinity mapping, and then to choose a preferred method of expressing that mapping to the OpenMP implementation.
# Specifying multithreading
diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c
index 7f74010451..f38710ea84 100644
--- a/frame/1m/packm/bli_packm_cntl.c
+++ b/frame/1m/packm/bli_packm_cntl.c
@@ -37,7 +37,7 @@
BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
void_fp var_func,
bszid_t bmid_m,
bszid_t bmid_n,
@@ -57,7 +57,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
#endif
// Allocate a packm_params_t struct.
- params = bli_sba_acquire( pool, sizeof( packm_params_t ) );
+ params = bli_sba_acquire( sba_pool, sizeof( packm_params_t ) );
// Initialize the packm_params_t struct.
params->size = sizeof( packm_params_t );
@@ -79,7 +79,7 @@ BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
// sync with the cntl_t tree.
cntl = bli_cntl_create_node
(
- pool,
+ sba_pool,
BLIS_NOID,
BLIS_NO_PART,
var_func,
diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h
index 8a43f711d1..a94a465b25 100644
--- a/frame/1m/packm/bli_packm_cntl.h
+++ b/frame/1m/packm/bli_packm_cntl.h
@@ -85,7 +85,7 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl )
cntl_t* bli_packm_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
void_fp var_func,
bszid_t bmid_m,
bszid_t bmid_n,
diff --git a/frame/3/bli_l3_decor.c b/frame/3/bli_l3_decor.c
index 88ec5def91..dc1d3bb1ba 100644
--- a/frame/3/bli_l3_decor.c
+++ b/frame/3/bli_l3_decor.c
@@ -89,7 +89,7 @@ static void bli_l3_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, const
// Create a default control tree for the operation, if needed.
cntl_t* cntl_use;
- pool_t* sba_pool = bli_apool_array_elem( tid, array );
+ pool_t* sba_pool = bli_sba_array_elem( tid, array );
bli_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, sba_pool, NULL, &cntl_use );
diff --git a/frame/3/bli_l3_sup_decor.c b/frame/3/bli_l3_sup_decor.c
index 7cda8bdcaa..d420559b58 100644
--- a/frame/3/bli_l3_sup_decor.c
+++ b/frame/3/bli_l3_sup_decor.c
@@ -69,7 +69,7 @@ static void bli_l3_sup_thread_decorator_entry( thrcomm_t* gl_comm, dim_t tid, co
bli_l3_thread_decorator_thread_check( gl_comm, rntm );
// Create the root node of the thread's thrinfo_t structure.
- pool_t* pool = bli_apool_array_elem( tid, array );
+ pool_t* pool = bli_sba_array_elem( tid, array );
thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
func
diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c
index 95d2a54398..5f3d39d391 100644
--- a/frame/3/bli_l3_thrinfo.c
+++ b/frame/3/bli_l3_thrinfo.c
@@ -44,16 +44,14 @@ thrinfo_t* bli_l3_thrinfo_create
const cntl_t* cntl
)
{
- pool_t* pool = NULL;
- if ( array != NULL )
- pool = bli_apool_array_elem( id, array );
+ pool_t* sba_pool = bli_sba_array_elem( id, array );
// Create the root thrinfo_t node.
thrinfo_t* root = bli_thrinfo_create_root
(
gl_comm,
id,
- pool,
+ sba_pool,
bli_pba_query()
);
@@ -123,7 +121,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
(
dim_t id,
thrcomm_t* gl_comm,
- pool_t* pool,
+ pool_t* sba_pool,
const rntm_t* rntm
)
{
@@ -132,7 +130,7 @@ thrinfo_t* bli_l3_sup_thrinfo_create
(
gl_comm,
id,
- pool,
+ sba_pool,
bli_pba_query()
);
@@ -176,10 +174,10 @@ void bli_l3_sup_thrinfo_update
thrinfo_t** root
)
{
- thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
- dim_t tid = bli_thrinfo_thread_id( *root );
- pool_t* pool = bli_thrinfo_sba_pool( *root );
- dim_t nt = bli_thrinfo_num_threads( *root );
+ thrcomm_t* gl_comm = bli_thrinfo_comm( *root );
+ dim_t tid = bli_thrinfo_thread_id( *root );
+ pool_t* sba_pool = bli_thrinfo_sba_pool( *root );
+ dim_t nt = bli_thrinfo_num_threads( *root );
// Return early in single-threaded execution
// since the thread control tree may not have been
@@ -187,7 +185,7 @@ void bli_l3_sup_thrinfo_update
if ( nt == 1 ) return;
bli_thrinfo_free( *root );
- *root = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm );
+ *root = bli_l3_sup_thrinfo_create( tid, gl_comm, sba_pool, rntm );
}
// -----------------------------------------------------------------------------
diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h
index d06f79207b..c11171a27c 100644
--- a/frame/base/bli_apool.h
+++ b/frame/base/bli_apool.h
@@ -56,7 +56,7 @@ BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
return &(apool->pool);
}
-BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
+BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
{
return &(apool->mutex);
}
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index af8f671859..a53a2fb64c 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -286,6 +286,11 @@ arch_t bli_arch_query_id_impl( void )
id = BLIS_ARCH_RV64IV;
#endif
+ // SiFive microarchitectures.
+ #ifdef BLIS_FAMILY_SIFIVE_X280
+ id = BLIS_ARCH_SIFIVE_X280;
+ #endif
+
// Generic microarchitecture.
#ifdef BLIS_FAMILY_GENERIC
id = BLIS_ARCH_GENERIC;
@@ -351,6 +356,8 @@ static const char* config_name[ BLIS_NUM_ARCHS ] =
"rv32iv",
"rv64iv",
+ "sifive_x280",
+
"generic"
};
diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h
index d91c0542d8..7f1db27066 100644
--- a/frame/base/bli_blksz.h
+++ b/frame/base/bli_blksz.h
@@ -84,14 +84,15 @@ BLIS_INLINE void bli_blksz_copy
*b_dst = *b_src;
}
-BLIS_INLINE void bli_blksz_copy_if_pos
+BLIS_INLINE void bli_blksz_copy_if_nonneg
(
const blksz_t* b_src,
blksz_t* b_dst
)
{
- // Copy the blocksize values over to b_dst one-by-one so that
- // we can skip the ones that are non-positive.
+ // Copy the blocksize values over to b_dst one-by-one. Note that we
+ // only copy values that are zero or positive (and skip copying any
+ // values that are negative).
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src );
const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src );
@@ -103,15 +104,15 @@ BLIS_INLINE void bli_blksz_copy_if_pos
const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src );
const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src );
- if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst );
- if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst );
- if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
- if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
+ if ( v_s >= 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst );
+ if ( v_d >= 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst );
+ if ( v_c >= 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
+ if ( v_z >= 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );
- if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst );
- if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst );
- if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
- if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
+ if ( e_s >= 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst );
+ if ( e_d >= 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst );
+ if ( e_c >= 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
+ if ( e_z >= 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
}
BLIS_INLINE void bli_blksz_copy_def_dt
diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c
index daa092ba72..bd688f85ad 100644
--- a/frame/base/bli_cntl.c
+++ b/frame/base/bli_cntl.c
@@ -37,7 +37,7 @@
cntl_t* bli_cntl_create_node
(
- pool_t* pool,
+ pool_t* sba_pool,
opid_t family,
bszid_t bszid,
void_fp var_func,
@@ -52,7 +52,7 @@ cntl_t* bli_cntl_create_node
#endif
// Allocate the cntl_t struct.
- cntl = bli_sba_acquire( pool, sizeof( cntl_t ) );
+ cntl = bli_sba_acquire( sba_pool, sizeof( cntl_t ) );
bli_cntl_set_family( family, cntl );
bli_cntl_set_bszid( bszid, cntl );
@@ -66,7 +66,7 @@ cntl_t* bli_cntl_create_node
void bli_cntl_free_node
(
- pool_t* pool,
+ pool_t* sba_pool,
cntl_t* cntl
)
{
@@ -74,7 +74,7 @@ void bli_cntl_free_node
printf( "bli_cntl_free_node(): " );
#endif
- bli_sba_release( pool, cntl );
+ bli_sba_release( sba_pool, cntl );
}
void bli_cntl_clear_node
@@ -94,7 +94,7 @@ void bli_cntl_clear_node
void bli_cntl_free
(
- pool_t* pool,
+ pool_t* sba_pool,
cntl_t* cntl
)
{
@@ -110,7 +110,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-prenode and its
// children.
- bli_cntl_free( pool, cntl_sub_prenode );
+ bli_cntl_free( sba_pool, cntl_sub_prenode );
}
// Only recurse into the child node if it exists.
@@ -118,7 +118,7 @@ void bli_cntl_free
{
// Recursively free all memory associated with the sub-node and its
// children.
- bli_cntl_free( pool, cntl_sub_node );
+ bli_cntl_free( sba_pool, cntl_sub_node );
}
// Free the current node's params field, if it is non-NULL.
@@ -128,18 +128,18 @@ void bli_cntl_free
printf( "bli_cntl_free_w_thrinfo(): " );
#endif
- bli_sba_release( pool, cntl_params );
+ bli_sba_release( sba_pool, cntl_params );
}
// Free the current node.
- bli_cntl_free_node( pool, cntl );
+ bli_cntl_free_node( sba_pool, cntl );
}
// -----------------------------------------------------------------------------
cntl_t* bli_cntl_copy
(
- pool_t* pool,
+ pool_t* sba_pool,
const cntl_t* cntl
)
{
@@ -149,7 +149,7 @@ cntl_t* bli_cntl_copy
// field.
cntl_t* cntl_copy = bli_cntl_create_node
(
- pool,
+ sba_pool,
bli_cntl_family( cntl ),
bli_cntl_bszid( cntl ),
bli_cntl_var_func( cntl ),
@@ -165,7 +165,7 @@ cntl_t* bli_cntl_copy
// struct.
uint64_t params_size = bli_cntl_params_size( cntl );
void* params_orig = bli_cntl_params( cntl );
- void* params_copy = bli_sba_acquire( pool, ( size_t )params_size );
+ void* params_copy = bli_sba_acquire( sba_pool, ( size_t )params_size );
// Copy the original params struct to the new memory region.
memcpy( params_copy, params_orig, params_size );
@@ -180,7 +180,7 @@ cntl_t* bli_cntl_copy
{
cntl_t* sub_prenode_copy = bli_cntl_copy
(
- pool,
+ sba_pool,
bli_cntl_sub_prenode( cntl )
);
@@ -194,7 +194,7 @@ cntl_t* bli_cntl_copy
{
cntl_t* sub_node_copy = bli_cntl_copy
(
- pool,
+ sba_pool,
bli_cntl_sub_node( cntl )
);
diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c
index 8c6cafc13c..4635c11f4a 100644
--- a/frame/base/bli_cntx.c
+++ b/frame/base/bli_cntx.c
@@ -100,7 +100,7 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... )
//cntx_blkszs[ bs_id ] = *blksz;
//bli_blksz_copy( blksz, cntx_blksz );
blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];
- bli_blksz_copy_if_pos( blksz, cntx_blksz );
+ bli_blksz_copy_if_nonneg( blksz, cntx_blksz );
// Copy the blocksize multiple id into the context.
cntx_bmults[ bs_id ] = bm_id;
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index 7b9ab3d7c2..a21aa12446 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -259,6 +259,14 @@ int bli_gks_init( void )
bli_cntx_init_rv64iv_ind );
#endif
+ // -- SiFive architectures ----------------------------------------------
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+ bli_gks_register_cntx( BLIS_ARCH_SIFIVE_X280, bli_cntx_init_sifive_x280,
+ bli_cntx_init_sifive_x280_ref,
+ bli_cntx_init_sifive_x280_ind );
+#endif
+
// -- Generic architectures --------------------------------------------
#ifdef BLIS_CONFIG_GENERIC
diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c
index 5123c5b4b2..54da4c7d91 100644
--- a/frame/base/bli_sba.c
+++ b/frame/base/bli_sba.c
@@ -47,17 +47,21 @@ apool_t* bli_sba_query( void )
void bli_sba_init( void )
{
+#ifdef BLIS_ENABLE_SBA_POOLS
bli_apool_init( &sba );
+#endif
}
void bli_sba_finalize( void )
{
+#ifdef BLIS_ENABLE_SBA_POOLS
bli_apool_finalize( &sba );
+#endif
}
void* bli_sba_acquire
(
- pool_t* pool,
+ pool_t* sba_pool,
siz_t req_size
)
{
@@ -74,7 +78,7 @@ void* bli_sba_acquire
// is convenient to not have to checkout an array_t from the sba, and it
// does no harm since the malloc() happens outside of the region that
// would be timed.)
- if ( pool == NULL )
+ if ( sba_pool == NULL )
{
block = bli_malloc_intl( req_size, &r_val );
}
@@ -84,10 +88,10 @@ void* bli_sba_acquire
// Query the block_size of the pool_t so that we can request the exact
// size present.
- const siz_t block_size = bli_pool_block_size( pool );
+ const siz_t block_size = bli_pool_block_size( sba_pool );
// Sanity check: Make sure the requested size is no larger than the
- // block_size field of the pool.
+ // block_size field of the sba pool.
if ( block_size < req_size )
{
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
@@ -96,7 +100,7 @@ void* bli_sba_acquire
}
// Check out a block using the block_size queried above.
- bli_pool_checkout_block( block_size, &pblk, pool );
+ bli_pool_checkout_block( block_size, &pblk, sba_pool );
// The block address is stored within the pblk_t.
block = bli_pblk_buf( &pblk );
@@ -114,13 +118,13 @@ void* bli_sba_acquire
void bli_sba_release
(
- pool_t* pool,
+ pool_t* sba_pool,
void* block
)
{
#ifdef BLIS_ENABLE_SBA_POOLS
- if ( pool == NULL )
+ if ( sba_pool == NULL )
{
bli_free_intl( block );
}
@@ -132,17 +136,17 @@ void bli_sba_release
// for this particular application of the pool_t (that is, the "leaf"
// component of the sba), but it seems like good housekeeping to maintain
// the block_size field of the pblk_t in case its ever needed/read.
- const siz_t block_size = bli_pool_block_size( pool );
+ const siz_t block_size = bli_pool_block_size( sba_pool );
// Embed the block's memory address into a pblk_t, along with the
- // block_size queried from the pool.
+ // block_size queried from the sba pool.
bli_pblk_set_buf( block, &pblk );
bli_pblk_set_block_size( block_size, &pblk );
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
// a local variable since its contents are copied into the pool's internal
// data structure--an array of pblk_t.)
- bli_pool_checkin_block( &pblk, pool );
+ bli_pool_checkin_block( &pblk, sba_pool );
}
#else
@@ -176,3 +180,17 @@ void bli_sba_checkin_array
#endif
}
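+// Return the sba pool_t* corresponding to the given thread index within
+// the array_t. When sba pools are disabled (or the array is NULL), return
+// NULL so that bli_sba_acquire()/bli_sba_release() fall back to
+// bli_malloc_intl()/bli_free_intl().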
+pool_t* bli_sba_array_elem
+ (
+ siz_t index,
+ array_t* array
+ )
+{
+#ifdef BLIS_ENABLE_SBA_POOLS
+ if ( array != NULL ) return bli_apool_array_elem( index, array );
+ else return NULL;
+#else
+ return NULL;
+#endif
+}
+
diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h
index 8d9db844f9..92e53e7b30 100644
--- a/frame/base/bli_sba.h
+++ b/frame/base/bli_sba.h
@@ -42,6 +42,18 @@ apool_t* bli_sba_query( void );
void bli_sba_init( void );
void bli_sba_finalize( void );
+void* bli_sba_acquire
+ (
+ pool_t* sba_pool,
+ siz_t req_size
+ );
+
+void bli_sba_release
+ (
+ pool_t* sba_pool,
+ void* block
+ );
+
array_t* bli_sba_checkout_array
(
siz_t n_threads
@@ -52,16 +64,10 @@ void bli_sba_checkin_array
array_t* array
);
-void* bli_sba_acquire
+pool_t* bli_sba_array_elem
(
- pool_t* pool,
- siz_t req_size
- );
-
-void bli_sba_release
- (
- pool_t* pool,
- void* block
+ siz_t index,
+ array_t* array
);
#endif
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index c5b5ebda37..8923acdc48 100644
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -38,8 +38,8 @@
//
// Define BLAS-to-BLIS interfaces.
//
-#undef GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
@@ -110,6 +110,6 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( symv, symv )
+INSERT_GENTFUNC_BLAS( symv, symv )
#endif
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 2f493a9d97..4f453a7a32 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -37,8 +37,8 @@
//
// Prototype BLAS-to-BLIS interfaces.
//
-#undef GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
@@ -52,7 +52,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
);
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( symv )
+INSERT_GENTPROT_BLAS( symv )
#endif
#endif
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 6732a75cf2..91dc99b599 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -38,8 +38,8 @@
//
// Define BLAS-to-BLIS interfaces.
//
-#undef GENTFUNCRO
-#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
+#undef GENTFUNC
+#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
@@ -101,6 +101,6 @@ void PASTEF77(ch,blasname) \
}
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTFUNCRO_BLAS( syr, syr )
+INSERT_GENTFUNC_BLAS( syr, syr )
#endif
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 662d07328f..7f3eeb3679 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -37,8 +37,8 @@
//
// Prototype BLAS-to-BLIS interfaces.
//
-#undef GENTPROTRO
-#define GENTPROTRO( ftype, ch, blasname ) \
+#undef GENTPROT
+#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
@@ -50,7 +50,7 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
);
#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROTRO_BLAS( syr )
+INSERT_GENTPROT_BLAS( syr )
#endif
#endif
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index c79769bc05..0dbd720d21 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
+ Copyright (C) 2023, Field G. Van Zee
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -358,5 +359,484 @@
return 0;
} /* zdrot_ */
+
+/* crot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine */ int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s)
+{
+ /* System generated locals */
+ bla_integer i__1, i__2, i__3, i__4;
+ bla_scomplex q__1, q__2, q__3, q__4;
+
+ /* Local variables */
+ bla_integer i__, ix, iy;
+ bla_scomplex stemp;
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
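+/* The disabled block below preserves the raw f2c-generated updates for
+   reference; the #else branch performs the same arithmetic via the
+   bli_csets()/bli_creal()/bli_cimag() accessor macros. */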
+#if 0
+ q__2.r = *c__ * cx[i__2].r;
+ q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r;
+ q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r;
+ stemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r;
+ q__2.i = *c__ * cy[i__3].i;
+ bla_r_cnjg(&q__4, s);
+ i__4 = ix;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i;
+ q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r;
+ q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r;
+ cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_csets
+ (
+ *c__ * bli_creal(cx[i__2]),
+ *c__ * bli_cimag(cx[i__2]),
+ q__2
+ );
+ i__3 = iy;
+ bli_csets
+ (
+ bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+ bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) + bli_creal(q__3),
+ bli_cimag(q__2) + bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ stemp
+ );
+ i__2 = iy;
+ i__3 = iy;
+ bli_csets
+ (
+ *c__ * bli_creal(cy[i__3]),
+ *c__ * bli_cimag(cy[i__3]),
+ q__2
+ );
+ bla_r_cnjg(&q__4, s);
+ i__4 = ix;
+ bli_csets
+ (
+ bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+ bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) - bli_creal(q__3),
+ bli_cimag(q__2) - bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ cy[i__2]
+ );
+ i__2 = ix;
+ bli_csets
+ (
+ bli_creal(stemp),
+ bli_cimag(stemp),
+ cx[i__2]
+ );
+#endif
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+#if 0
+ q__2.r = *c__ * cx[i__2].r;
+ q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ q__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r;
+ q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r;
+ stemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r;
+ q__2.i = *c__ * cy[i__3].i;
+ bla_r_cnjg(&q__4, s);
+ i__4 = i__;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i;
+ q__3.i = q__4.r * cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r;
+ q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r;
+ cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_csets
+ (
+ *c__ * bli_creal(cx[i__2]),
+ *c__ * bli_cimag(cx[i__2]),
+ q__2
+ );
+ i__3 = i__;
+ bli_csets
+ (
+ bli_creal(*s) * bli_creal(cy[i__3]) - bli_cimag(*s) * bli_cimag(cy[i__3]),
+ bli_creal(*s) * bli_cimag(cy[i__3]) + bli_cimag(*s) * bli_creal(cy[i__3]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) + bli_creal(q__3),
+ bli_cimag(q__2) + bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ stemp
+ );
+ i__2 = i__;
+ i__3 = i__;
+ bli_csets
+ (
+ *c__ * bli_creal(cy[i__3]),
+ *c__ * bli_cimag(cy[i__3]),
+ q__2
+ );
+ bla_r_cnjg(&q__4, s);
+ i__4 = i__;
+ bli_csets
+ (
+ bli_creal(q__4) * bli_creal(cx[i__4]) - bli_cimag(q__4) * bli_cimag(cx[i__4]),
+ bli_creal(q__4) * bli_cimag(cx[i__4]) + bli_cimag(q__4) * bli_creal(cx[i__4]),
+ q__3
+ );
+ bli_csets
+ (
+ bli_creal(q__2) - bli_creal(q__3),
+ bli_cimag(q__2) - bli_cimag(q__3),
+ q__1
+ );
+ bli_csets
+ (
+ bli_creal(q__1),
+ bli_cimag(q__1),
+ cy[i__2]
+ );
+ i__2 = i__;
+ bli_csets
+ (
+ bli_creal(stemp),
+ bli_cimag(stemp),
+ cx[i__2]
+ );
+#endif
+/* L30: */
+ }
+ return 0;
+} /* crot_ */
+
+
+/* zrot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+/* Subroutine */ int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s)
+{
+ /* System generated locals */
+ bla_integer i__1, i__2, i__3, i__4;
+ bla_dcomplex z__1, z__2, z__3, z__4;
+
+ /* Local variables */
+ bla_integer i__, ix, iy;
+ bla_dcomplex stemp;
+
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+#if 0
+ z__2.r = *c__ * cx[i__2].r;
+ z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r;
+ z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r;
+ stemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r;
+ z__2.i = *c__ * cy[i__3].i;
+ bla_d_cnjg(&z__4, s);
+ i__4 = ix;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+ z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r;
+ z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r;
+ cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_zsets
+ (
+ *c__ * bli_zreal(cx[i__2]),
+ *c__ * bli_zimag(cx[i__2]),
+ z__2
+ );
+ i__3 = iy;
+ bli_zsets
+ (
+ bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+ bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) + bli_zreal(z__3),
+ bli_zimag(z__2) + bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ stemp
+ );
+ i__2 = iy;
+ i__3 = iy;
+ bli_zsets
+ (
+ *c__ * bli_zreal(cy[i__3]),
+ *c__ * bli_zimag(cy[i__3]),
+ z__2
+ );
+ bla_d_cnjg(&z__4, s);
+ i__4 = ix;
+ bli_zsets
+ (
+ bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+ bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) - bli_zreal(z__3),
+ bli_zimag(z__2) - bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ cy[i__2]
+ );
+ i__2 = ix;
+ bli_zsets
+ (
+ bli_zreal(stemp),
+ bli_zimag(stemp),
+ cx[i__2]
+ );
+#endif
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+#if 0
+ z__2.r = *c__ * cx[i__2].r;
+ z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i;
+ z__3.i = s->r * cy[i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r;
+ z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r;
+ stemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r;
+ z__2.i = *c__ * cy[i__3].i;
+ bla_d_cnjg(&z__4, s);
+ i__4 = i__;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i;
+ z__3.i = z__4.r * cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r;
+ z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r;
+ cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r;
+ cx[i__2].i = stemp.i;
+#else
+ bli_zsets
+ (
+ *c__ * bli_zreal(cx[i__2]),
+ *c__ * bli_zimag(cx[i__2]),
+ z__2
+ );
+ i__3 = i__;
+ bli_zsets
+ (
+ bli_zreal(*s) * bli_zreal(cy[i__3]) - bli_zimag(*s) * bli_zimag(cy[i__3]),
+ bli_zreal(*s) * bli_zimag(cy[i__3]) + bli_zimag(*s) * bli_zreal(cy[i__3]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) + bli_zreal(z__3),
+ bli_zimag(z__2) + bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ stemp
+ );
+ i__2 = i__;
+ i__3 = i__;
+ bli_zsets
+ (
+ *c__ * bli_zreal(cy[i__3]),
+ *c__ * bli_zimag(cy[i__3]),
+ z__2
+ );
+ bla_d_cnjg(&z__4, s);
+ i__4 = i__;
+ bli_zsets
+ (
+ bli_zreal(z__4) * bli_zreal(cx[i__4]) - bli_zimag(z__4) * bli_zimag(cx[i__4]),
+ bli_zreal(z__4) * bli_zimag(cx[i__4]) + bli_zimag(z__4) * bli_zreal(cx[i__4]),
+ z__3
+ );
+ bli_zsets
+ (
+ bli_zreal(z__2) - bli_zreal(z__3),
+ bli_zimag(z__2) - bli_zimag(z__3),
+ z__1
+ );
+ bli_zsets
+ (
+ bli_zreal(z__1),
+ bli_zimag(z__1),
+ cy[i__2]
+ );
+ i__2 = i__;
+ bli_zsets
+ (
+ bli_zreal(stemp),
+ bli_zimag(stemp),
+ cx[i__2]
+ );
+#endif
+/* L30: */
+ }
+ return 0;
+} /* zrot_ */
+
+
#endif
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index ca4a4f9ac1..4e6aead4a8 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -38,5 +38,7 @@ BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const b
BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
+BLIS_EXPORT_BLAS int PASTEF77(c,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_scomplex *s);
+BLIS_EXPORT_BLAS int PASTEF77(z,rot)(const bla_integer *n, bla_dcomplex *cx, const bla_integer *incx, bla_dcomplex *cy, const bla_integer *incy, const bla_double *c__, const bla_dcomplex *s);
#endif
diff --git a/frame/compat/f2c/other/crot.c b/frame/compat/f2c/other/crot.c
new file mode 100644
index 0000000000..e3e1282f4e
--- /dev/null
+++ b/frame/compat/f2c/other/crot.c
@@ -0,0 +1,227 @@
+/* crot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*/
+
+/* =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/* http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download CROT + dependencies */
+/* > */
+/* > [TGZ] */
+/* > */
+/* > [ZIP] */
+/* > */
+/* > [TXT] */
+/* > \endhtmlonly */
+
+/* Definition: */
+/* =========== */
+
+/* SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/* .. Scalar Arguments .. */
+/* INTEGER INCX, INCY, N */
+/* REAL C */
+/* COMPLEX S */
+/* .. */
+/* .. Array Arguments .. */
+/* COMPLEX CX( * ), CY( * ) */
+/* .. */
+
+
+/* > \par Purpose: */
+/* ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > CROT applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/* Arguments: */
+/* ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* > N is INTEGER */
+/* > The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* > CX is COMPLEX array, dimension (N) */
+/* > On input, the vector X. */
+/* > On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* > INCX is INTEGER */
+/* > The increment between successive values of CX. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* > CY is COMPLEX array, dimension (N) */
+/* > On input, the vector Y. */
+/* > On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* > INCY is INTEGER */
+/* > The increment between successive values of CY. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* > C is REAL */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* > S is COMPLEX */
+/* > C and S define a rotation */
+/* > [ C S ] */
+/* > [ -conjg(S) C ] */
+/* > where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/* Authors: */
+/* ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complexOTHERauxiliary */
+
+/* ===================================================================== */
+/* Subroutine */ int crot_(integer *n, complex *cx, integer *incx, complex *
+ cy, integer *incy, real *c__, complex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Builtin functions */
+ void r_cnjg(complex *, complex *);
+
+ /* Local variables */
+ integer i__, ix, iy;
+ complex stemp;
+
+
+/* -- LAPACK auxiliary routine -- */
+/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
+/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/* .. Scalar Arguments .. */
+/* .. */
+/* .. Array Arguments .. */
+/* .. */
+
+/* ===================================================================== */
+
+/* .. Local Scalars .. */
+/* .. */
+/* .. Intrinsic Functions .. */
+/* .. */
+/* .. Executable Statements .. */
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = ix;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ q__2.r = *c__ * cx[i__2].r, q__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ q__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, q__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+ stemp.r = q__1.r, stemp.i = q__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ q__2.r = *c__ * cy[i__3].r, q__2.i = *c__ * cy[i__3].i;
+ r_cnjg(&q__4, s);
+ i__4 = i__;
+ q__3.r = q__4.r * cx[i__4].r - q__4.i * cx[i__4].i, q__3.i = q__4.r *
+ cx[i__4].i + q__4.i * cx[i__4].r;
+ q__1.r = q__2.r - q__3.r, q__1.i = q__2.i - q__3.i;
+ cy[i__2].r = q__1.r, cy[i__2].i = q__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* crot_ */
+
diff --git a/frame/compat/f2c/other/crot.f b/frame/compat/f2c/other/crot.f
new file mode 100644
index 0000000000..6dc771506f
--- /dev/null
+++ b/frame/compat/f2c/other/crot.f
@@ -0,0 +1,159 @@
+*> \brief \b CROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CROT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*
+* .. Scalar Arguments ..
+* INTEGER INCX, INCY, N
+* REAL C
+* COMPLEX S
+* ..
+* .. Array Arguments ..
+* COMPLEX CX( * ), CY( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CROT applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*> CX is COMPLEX array, dimension (N)
+*> On input, the vector X.
+*> On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> The increment between successive values of CX. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*> CY is COMPLEX array, dimension (N)
+*> On input, the vector Y.
+*> On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> The increment between successive values of CY. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*> C is REAL
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*> S is COMPLEX
+*> C and S define a rotation
+*> [ C S ]
+*> [ -conjg(S) C ]
+*> where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+* =====================================================================
+ SUBROUTINE CROT( N, CX, INCX, CY, INCY, C, S )
+*
+* -- LAPACK auxiliary routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER INCX, INCY, N
+ REAL C
+ COMPLEX S
+* ..
+* .. Array Arguments ..
+ COMPLEX CX( * ), CY( * )
+* ..
+*
+* =====================================================================
+*
+* .. Local Scalars ..
+ INTEGER I, IX, IY
+ COMPLEX STEMP
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG
+* ..
+* .. Executable Statements ..
+*
+ IF( N.LE.0 )
+ $ RETURN
+ IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+ $ GO TO 20
+*
+* Code for unequal increments or equal increments not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF( INCX.LT.0 )
+ $ IX = ( -N+1 )*INCX + 1
+ IF( INCY.LT.0 )
+ $ IY = ( -N+1 )*INCY + 1
+ DO 10 I = 1, N
+ STEMP = C*CX( IX ) + S*CY( IY )
+ CY( IY ) = C*CY( IY ) - CONJG( S )*CX( IX )
+ CX( IX ) = STEMP
+ IX = IX + INCX
+ IY = IY + INCY
+ 10 CONTINUE
+ RETURN
+*
+* Code for both increments equal to 1
+*
+ 20 CONTINUE
+ DO 30 I = 1, N
+ STEMP = C*CX( I ) + S*CY( I )
+ CY( I ) = C*CY( I ) - CONJG( S )*CX( I )
+ CX( I ) = STEMP
+ 30 CONTINUE
+ RETURN
+ END
diff --git a/frame/compat/f2c/other/zrot.c b/frame/compat/f2c/other/zrot.c
new file mode 100644
index 0000000000..0706f8b251
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.c
@@ -0,0 +1,227 @@
+/* zrot.f -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* > \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*/
+
+/* =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/* http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download ZROT + dependencies */
+/* > */
+/* > [TGZ] */
+/* > */
+/* > [ZIP] */
+/* > */
+/* > [TXT] */
+/* > \endhtmlonly */
+
+/* Definition: */
+/* =========== */
+
+/* SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S ) */
+
+/* .. Scalar Arguments .. */
+/* INTEGER INCX, INCY, N */
+/* DOUBLE PRECISION C */
+/* COMPLEX*16 S */
+/* .. */
+/* .. Array Arguments .. */
+/* COMPLEX*16 CX( * ), CY( * ) */
+/* .. */
+
+
+/* > \par Purpose: */
+/* ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > ZROT applies a plane rotation, where the cos (C) is real and the */
+/* > sin (S) is complex, and the vectors CX and CY are complex. */
+/* > \endverbatim */
+
+/* Arguments: */
+/* ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* > N is INTEGER */
+/* > The number of elements in the vectors CX and CY. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CX */
+/* > \verbatim */
+/* > CX is COMPLEX*16 array, dimension (N) */
+/* > On input, the vector X. */
+/* > On output, CX is overwritten with C*X + S*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* > INCX is INTEGER */
+/* > The increment between successive values of CX. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] CY */
+/* > \verbatim */
+/* > CY is COMPLEX*16 array, dimension (N) */
+/* > On input, the vector Y. */
+/* > On output, CY is overwritten with -CONJG(S)*X + C*Y. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCY */
+/* > \verbatim */
+/* > INCY is INTEGER */
+/* > The increment between successive values of CY. INCX <> 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] C */
+/* > \verbatim */
+/* > C is DOUBLE PRECISION */
+/* > \endverbatim */
+/* > */
+/* > \param[in] S */
+/* > \verbatim */
+/* > S is COMPLEX*16 */
+/* > C and S define a rotation */
+/* > [ C S ] */
+/* > [ -conjg(S) C ] */
+/* > where C*C + S*CONJG(S) = 1.0. */
+/* > \endverbatim */
+
+/* Authors: */
+/* ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16OTHERauxiliary */
+
+/* ===================================================================== */
+/* Subroutine */ int zrot_(integer *n, doublecomplex *cx, integer *incx,
+ doublecomplex *cy, integer *incy, doublereal *c__, doublecomplex *s)
+{
+ /* System generated locals */
+ integer i__1, i__2, i__3, i__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Builtin functions */
+ void d_cnjg(doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ integer i__, ix, iy;
+ doublecomplex stemp;
+
+
+/* -- LAPACK auxiliary routine -- */
+/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */
+/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+/* .. Scalar Arguments .. */
+/* .. */
+/* .. Array Arguments .. */
+/* .. */
+
+/* ===================================================================== */
+
+/* .. Local Scalars .. */
+/* .. */
+/* .. Intrinsic Functions .. */
+/* .. */
+/* .. Executable Statements .. */
+
+ /* Parameter adjustments */
+ --cy;
+ --cx;
+
+ /* Function Body */
+ if (*n <= 0) {
+ return 0;
+ }
+ if (*incx == 1 && *incy == 1) {
+ goto L20;
+ }
+
+/* Code for unequal increments or equal increments not equal to 1 */
+
+ ix = 1;
+ iy = 1;
+ if (*incx < 0) {
+ ix = (-(*n) + 1) * *incx + 1;
+ }
+ if (*incy < 0) {
+ iy = (-(*n) + 1) * *incy + 1;
+ }
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = ix;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = iy;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = iy;
+ i__3 = iy;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = ix;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = ix;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+ ix += *incx;
+ iy += *incy;
+/* L10: */
+ }
+ return 0;
+
+/* Code for both increments equal to 1 */
+
+L20:
+ i__1 = *n;
+ for (i__ = 1; i__ <= i__1; ++i__) {
+ i__2 = i__;
+ z__2.r = *c__ * cx[i__2].r, z__2.i = *c__ * cx[i__2].i;
+ i__3 = i__;
+ z__3.r = s->r * cy[i__3].r - s->i * cy[i__3].i, z__3.i = s->r * cy[
+ i__3].i + s->i * cy[i__3].r;
+ z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+ stemp.r = z__1.r, stemp.i = z__1.i;
+ i__2 = i__;
+ i__3 = i__;
+ z__2.r = *c__ * cy[i__3].r, z__2.i = *c__ * cy[i__3].i;
+ d_cnjg(&z__4, s);
+ i__4 = i__;
+ z__3.r = z__4.r * cx[i__4].r - z__4.i * cx[i__4].i, z__3.i = z__4.r *
+ cx[i__4].i + z__4.i * cx[i__4].r;
+ z__1.r = z__2.r - z__3.r, z__1.i = z__2.i - z__3.i;
+ cy[i__2].r = z__1.r, cy[i__2].i = z__1.i;
+ i__2 = i__;
+ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i;
+/* L30: */
+ }
+ return 0;
+} /* zrot_ */
+
diff --git a/frame/compat/f2c/other/zrot.f b/frame/compat/f2c/other/zrot.f
new file mode 100644
index 0000000000..28fc8ec1de
--- /dev/null
+++ b/frame/compat/f2c/other/zrot.f
@@ -0,0 +1,159 @@
+*> \brief \b ZROT applies a plane rotation with real cosine and complex sine to a pair of complex vectors.
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZROT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*
+* .. Scalar Arguments ..
+* INTEGER INCX, INCY, N
+* DOUBLE PRECISION C
+* COMPLEX*16 S
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 CX( * ), CY( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZROT applies a plane rotation, where the cos (C) is real and the
+*> sin (S) is complex, and the vectors CX and CY are complex.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of elements in the vectors CX and CY.
+*> \endverbatim
+*>
+*> \param[in,out] CX
+*> \verbatim
+*> CX is COMPLEX*16 array, dimension (N)
+*> On input, the vector X.
+*> On output, CX is overwritten with C*X + S*Y.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> The increment between successive values of CX. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*> CY is COMPLEX*16 array, dimension (N)
+*> On input, the vector Y.
+*> On output, CY is overwritten with -CONJG(S)*X + C*Y.
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> The increment between successive values of CY. INCX <> 0.
+*> \endverbatim
+*>
+*> \param[in] C
+*> \verbatim
+*> C is DOUBLE PRECISION
+*> \endverbatim
+*>
+*> \param[in] S
+*> \verbatim
+*> S is COMPLEX*16
+*> C and S define a rotation
+*> [ C S ]
+*> [ -conjg(S) C ]
+*> where C*C + S*CONJG(S) = 1.0.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16OTHERauxiliary
+*
+* =====================================================================
+ SUBROUTINE ZROT( N, CX, INCX, CY, INCY, C, S )
+*
+* -- LAPACK auxiliary routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER INCX, INCY, N
+ DOUBLE PRECISION C
+ COMPLEX*16 S
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 CX( * ), CY( * )
+* ..
+*
+* =====================================================================
+*
+* .. Local Scalars ..
+ INTEGER I, IX, IY
+ COMPLEX*16 STEMP
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC DCONJG
+* ..
+* .. Executable Statements ..
+*
+ IF( N.LE.0 )
+ $ RETURN
+ IF( INCX.EQ.1 .AND. INCY.EQ.1 )
+ $ GO TO 20
+*
+* Code for unequal increments or equal increments not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF( INCX.LT.0 )
+ $ IX = ( -N+1 )*INCX + 1
+ IF( INCY.LT.0 )
+ $ IY = ( -N+1 )*INCY + 1
+ DO 10 I = 1, N
+ STEMP = C*CX( IX ) + S*CY( IY )
+ CY( IY ) = C*CY( IY ) - DCONJG( S )*CX( IX )
+ CX( IX ) = STEMP
+ IX = IX + INCX
+ IY = IY + INCY
+ 10 CONTINUE
+ RETURN
+*
+* Code for both increments equal to 1
+*
+ 20 CONTINUE
+ DO 30 I = 1, N
+ STEMP = C*CX( I ) + S*CY( I )
+ CY( I ) = C*CY( I ) - DCONJG( S )*CX( I )
+ CX( I ) = STEMP
+ 30 CONTINUE
+ RETURN
+ END
diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h
index 37c5af3984..6a5a5a569a 100644
--- a/frame/include/bli_arch_config.h
+++ b/frame/include/bli_arch_config.h
@@ -156,6 +156,11 @@ CNTX_INIT_PROTS( rv32iv )
CNTX_INIT_PROTS( rv64iv )
#endif
+// -- SiFive architectures --
+
+#ifdef BLIS_CONFIG_SIFIVE_X280
+CNTX_INIT_PROTS( sifive_x280 )
+#endif
// -- Generic --
@@ -296,6 +301,12 @@ CNTX_INIT_PROTS( generic )
#include "bli_family_bgq.h"
#endif
+// -- SiFive families --
+
+#ifdef BLIS_FAMILY_SIFIVE_X280
+#include "bli_family_sifive_x280.h"
+#endif
+
// -- Generic --
#ifdef BLIS_FAMILY_GENERIC
@@ -386,5 +397,12 @@ CNTX_INIT_PROTS( generic )
#include "bli_kernels_rviv.h"
#endif
+// -- SiFive RISC-V architectures --
+
+#ifdef BLIS_KERNELS_SIFIVE_X280
+#include "bli_kernels_sifive_x280.h"
+#endif
+
+
#endif
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 62ccd0c41a..9771ec5791 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -975,6 +975,9 @@ typedef enum
BLIS_ARCH_RV32IV,
BLIS_ARCH_RV64IV,
+ // SiFive
+ BLIS_ARCH_SIFIVE_X280,
+
// Generic architecture/configuration
BLIS_ARCH_GENERIC,
diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h
index 96e21b99f8..56f5aba9a9 100644
--- a/frame/thread/bli_thrcomm.h
+++ b/frame/thread/bli_thrcomm.h
@@ -67,6 +67,17 @@ typedef struct barrier_s barrier_t;
#endif
#endif
+// Define hpx_barrier_t, which is specific to the barrier used in the HPX
+// implementation. This needs to be defined first since it is (potentially)
+// used within the definition of thrcomm_t below.
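+// The handle member stores a type-erased pointer to the C++ hpx::barrier<>
+// object, which is created and destroyed in bli_thrcomm_hpx.cpp.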
+
+#ifdef BLIS_ENABLE_HPX
+typedef struct hpx_barrier_t
+{
+ void* handle;
+} hpx_barrier_t;
+#endif
+
// Define the thrcomm_t structure, which will be common to all threading
// implementations.
@@ -124,9 +135,7 @@ typedef struct thrcomm_s
// -- Fields specific to HPX --
#ifdef BLIS_ENABLE_HPX
- #ifdef BLIS_USE_HPX_BARRIER
- hpx::barrier<> * barrier;
- #endif
+ hpx_barrier_t barrier;
#endif
} thrcomm_t;
diff --git a/frame/thread/bli_thrcomm_hpx.cpp b/frame/thread/bli_thrcomm_hpx.cpp
index 323871ef80..0947dc81df 100644
--- a/frame/thread/bli_thrcomm_hpx.cpp
+++ b/frame/thread/bli_thrcomm_hpx.cpp
@@ -36,43 +36,36 @@
#ifdef BLIS_ENABLE_HPX
+#include <hpx/barrier.hpp>
extern "C" {
-#ifdef BLIS_USE_HPX_BARRIER
-
// Define the hpx_barrier_t implementations of the init, cleanup, and
// barrier functions.
-void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
+void hpx_barrier_init( hpx_barrier_t* barrier, dim_t n_threads )
{
- if ( comm == nullptr ) return;
-
- //comm->sent_object = nullptr;
- //comm->n_threads = n_threads;
- comm->ti = BLIS_HPX;
- //comm->barrier_sense = 0;
- //comm->barrier_threads_arrived = 0;
-
- comm->barrier = new hpx:barrier<>();
+ if ( barrier == nullptr ) return;
+ barrier->handle = new hpx::barrier<>( n_threads );
}
-void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
+void hpx_barrier_destroy( hpx_barrier_t* barrier )
{
- if ( comm == nullptr ) return;
+ if ( barrier == nullptr ) return;
- delete comm->barrier;
-}
+ auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
+ barrier->handle = nullptr;
-void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
-{
- comm->barrier->arrive_and_wait();
+ delete barrier_;
}
-#else
+void hpx_barrier_arrive_and_wait( hpx_barrier_t* barrier )
+{
+ if ( barrier == nullptr ) return;
+ auto* barrier_ = reinterpret_cast<hpx::barrier<>*>( barrier->handle );
-// Define the non-hpx::barrier implementations of the init, cleanup,
-// and barrier functions. These are the default unless the hpx::barrier
-// versions are requested at compile-time.
+ if ( barrier_ == nullptr ) return;
+ barrier_->arrive_and_wait();
+}
void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
{
@@ -81,22 +74,24 @@ void bli_thrcomm_init_hpx( dim_t n_threads, thrcomm_t* comm )
comm->sent_object = nullptr;
comm->n_threads = n_threads;
comm->ti = BLIS_HPX;
- comm->barrier_sense = 0;
- comm->barrier_threads_arrived = 0;
+ // comm->barrier_sense = 0;
+ // comm->barrier_threads_arrived = 0;
+
+ hpx_barrier_init( &comm->barrier, n_threads );
}
void bli_thrcomm_cleanup_hpx( thrcomm_t* comm )
{
+ if ( comm == nullptr ) return;
+ hpx_barrier_destroy( &comm->barrier );
}
void bli_thrcomm_barrier_hpx( dim_t t_id, thrcomm_t* comm )
{
- bli_thrcomm_barrier_atomic( t_id, comm );
+ hpx_barrier_arrive_and_wait( &comm->barrier );
}
-} // extern "C"
-
-#endif
+}
#endif
diff --git a/frame/thread/bli_thread_hpx.cpp b/frame/thread/bli_thread_hpx.cpp
index f69a0f5d7e..baf2eb3f2d 100644
--- a/frame/thread/bli_thread_hpx.cpp
+++ b/frame/thread/bli_thread_hpx.cpp
@@ -36,9 +36,10 @@
#ifdef BLIS_ENABLE_HPX
-#include
-#include
+#include
#include
+#include
+#include
extern "C"
{
@@ -56,12 +57,21 @@ void bli_thread_launch_hpx
pool_t* gl_comm_pool = nullptr;
thrcomm_t* gl_comm = bli_thrcomm_create( ti, gl_comm_pool, n_threads );
- auto irange = hpx::util::counting_shape(n_threads);
-
- hpx::for_each(hpx::execution::par, hpx::util::begin(irange), hpx::util::end(irange),
- [&gl_comm, &func, &params](const dim_t tid)
+ // Execute func on the HPX runtime: one hpx::async task is spawned per
+ // thread id, and hpx::wait_all joins them before returning.
+ hpx::threads::run_as_hpx_thread([&]()
{
- func( gl_comm, tid, params );
+ std::vector<hpx::future<void>> futures;
+ futures.reserve(n_threads);
+
+ for (dim_t tid = 0; tid < n_threads; ++tid)
+ {
+ futures.push_back(hpx::async([tid, &gl_comm, &func, &params]()
+ {
+ func( gl_comm, tid, params );
+ }));
+ }
+
+ hpx::wait_all(futures);
});
// Free the global communicator, because the root thrinfo_t node
@@ -76,7 +86,7 @@ void bli_thread_initialize_hpx( int argc, char** argv )
int bli_thread_finalize_hpx()
{
- hpx::apply([]() { hpx::finalize(); });
+ hpx::post([]() { hpx::finalize(); });
return hpx::stop();
}
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
new file mode 100644
index 0000000000..2b7ad6fe7d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define ADDV(...) ADDV_(__VA_ARGS__)
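+// Each #include below instantiates the kernel defined above for one datatype;
+// the DATATYPE, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE macros select the
+// element type and the RVV element width and register grouping used by that
+// instantiation.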
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_addv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef ADDV
+#undef ADDV_
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..d5343befe0
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+ // Computes y := y + conjx(x)
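+ // Complex elements are loaded with (strided) segment loads, which
+ // de-interleave the real and imaginary parts into separate vector registers;
+ // conjugation then only changes the sign used when accumulating the
+ // imaginary part.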
+ (void) cntx;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFADD_VV(PREC, LMUL)(yvec_real, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+ else
+ yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..d4e7d4a45e
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c
@@ -0,0 +1,78 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef ADDV
+
+ADDV(PRECISION_CHAR, void)
+{
+ // Computes y = y + conjx(x)
+ // == y + x (real case)
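+ // Unit-stride operands use unit-stride vector loads/stores; non-unit
+ // strides use the strided forms with a byte stride of FLT_SIZE * inc.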
+
+ (void) cntx;
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFADD_VV(PREC, LMUL)(yvec, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // ADDV
diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
new file mode 100644
index 0000000000..c423dd131d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -0,0 +1,293 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include
+#include
+#include
+#include
+
+void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const float* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 4;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
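+ // v8 holds the running elementwise maximum of |x| and v16 the corresponding
+ // element indices (as 64-bit values). A NaN encountered in x causes an
+ // immediate return with its index. After the loop, vfredmax finds the
+ // maximum value and a masked vredminu selects the smallest index at which
+ // it occurs.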
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 4)
+ __asm__("vle32.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vfabs.v v8, v24");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vfabs.v v24, v24");
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const double* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 8;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 8)
+ __asm__("vle64.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vfabs.v v8, v24");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vfabs.v v24, v24");
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const scomplex* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 8;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 8)
+ __asm__("vlseg2e32.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfabs.v v24, v24");
+ __asm__("vfabs.v v28, v28");
+ __asm__("vfadd.vv v24, v24, v28");
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vmv4r.v v8, v24");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ // keep vl same, change SEW and LMUL
+ __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
+
+void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
+ dim_t *index, const cntx_t *cntx) {
+ // assumes 64-bit index
+ (void)cntx;
+ const dcomplex* restrict x = x_;
+
+ if (n <= 1) {
+ *index = 0;
+ return;
+ }
+ incx *= 16;
+ size_t avl = n;
+ size_t offset = 0;
+ bool first = true;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma"
+ : "=r"(vl)
+ : "r"(avl));
+ if (incx == 16)
+ __asm__("vlseg2e64.v v24, (%0)" : : "r"(x));
+ else
+ __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfabs.v v24, v24");
+ __asm__("vfabs.v v28, v28");
+ __asm__("vfadd.vv v24, v24, v28");
+ // check for NaN
+ __asm__ volatile("vmfne.vv v0, v24, v24");
+ dim_t nan_index;
+ __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index));
+ if (nan_index != -1) {
+ *index = nan_index + offset;
+ return;
+ }
+ if (first) {
+ __asm__("vmv4r.v v8, v24");
+ __asm__("vid.v v16");
+ first = false;
+ } else {
+ __asm__("vmflt.vv v0, v8, v24");
+ __asm__("vmerge.vvm v8, v8, v24, v0");
+ __asm__("vid.v v24");
+ __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
+ __asm__("vmerge.vvm v16, v16, v24, v0");
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ offset += vl;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n));
+ __asm__("vmv.s.x v0, zero");
+ __asm__("vfredmax.vs v0, v8, v0");
+ __asm__("vrgather.vi v24, v0, 0");
+ __asm__("vmfeq.vv v0, v8, v24");
+ uint64_t imax = -1;
+ __asm__("vmv.s.x v24, %0" : : "r"(imax));
+ __asm__("vredminu.vs v24, v16, v24, v0.t");
+ __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma");
+ __asm__("vse64.v v24, (%0)" : : "r"(index));
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..3b29f898df
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c
@@ -0,0 +1,129 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict beta_, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define AXPBYV(...) AXPBYV_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr
+#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR)
+#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr
+#define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR)
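+// These macros name other SiFive x280 kernels; the axpbyv kernels below
+// dispatch to setv, scalv, or scal2v when alpha and/or beta is zero.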
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPBYV
+#undef AXPBYV_
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..31fc584b97
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,121 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+ // Computes y := beta * y + alpha * conjx(x)
+
+ if (n <= 0) return;
+
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (alpha->real == 0 && alpha->imag == 0 && beta->real == 0 && beta->imag == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+ if (alpha->real == 0 && alpha->imag == 0){
+ SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+ return;
+ }
+ if (beta->real == 0 && beta->imag == 0){
+ SCAL2V(PRECISION_CHAR)(conjx, n, alpha, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we
+ // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+ // TO DO (optimization): special cases for alpha = +-1, +-i, beta = +-1, +-i
+
+ // alpha and beta are both nonzero
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, temp_real, temp_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ // Computed as:
+ //   y.real = beta.real * y.real - beta.imag * y.imag + alpha.real * x.real - alpha.imag * xi
+ //   y.imag = beta.real * y.imag + beta.imag * y.real + alpha.imag * x.real + alpha.real * xi
+ // where xi is x.imag when conjx == BLIS_NO_CONJUGATE and -x.imag otherwise.
+ temp_real = VFMUL_VF(PREC, LMUL) (yvec_real, beta->real, vl);
+ temp_imag = VFMUL_VF(PREC, LMUL) (yvec_imag, beta->real, vl);
+ temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, beta->imag, yvec_imag, vl);
+ temp_imag = VFMACC_VF(PREC, LMUL) (temp_imag, beta->imag, yvec_real, vl);
+ yvec_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->real, xvec_real, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (temp_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE) {
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..33eafc5d12
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,98 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPBYV
+
+AXPBYV(PRECISION_CHAR, void)
+{
+ // Computes y := beta * y + alpha * conjx(x)
+ // == beta * y + alpha * x (real case)
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
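+    // Degenerate scaling factors reduce to simpler kernels: alpha == 0 and beta == 0 writes
+    // zeros (setv with alpha), alpha == 0 alone is y := beta * y (scalv), and beta == 0
+    // alone is y := alpha * x (scal2v).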
+ if (*alpha == 0 && *beta == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+ if (*alpha == 0){
+ SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx);
+ return;
+ }
+ if (*beta == 0){
+ SCAL2V(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we
+ // will canonicalize NaNs whereas the reference code will propagate NaN payloads.
+
+ // TO DO (optimization): special cases for alpha = +-1, beta = +-1
+
+ // alpha and beta are both nonzero
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMUL_VF(PREC, LMUL) (yvec, *beta, vl);
+ yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL)(y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+}
+
+#endif // AXPBYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..3f9ebd3b04
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c
@@ -0,0 +1,119 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define AXPYV(...) AXPYV_(__VA_ARGS__)
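+
+// The per-type kernels are generated by including the _real.c / _complex.c bodies once per
+// datatype below; DATATYPE, BASE_DT, PRECISION_CHAR, PREC, LMUL, and FLT_SIZE parameterize
+// each instantiation, and the AXPYV macro above expands to the matching function signature.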
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPYV
+#undef AXPYV_
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..dc520d2125
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,94 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+ // Computes y := y + alpha * conjx(x)
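+    // Per element, with s = +1 for BLIS_NO_CONJUGATE and s = -1 for BLIS_CONJUGATE:
+    //   y.real += alpha.real * x.real - s * alpha.imag * x.imag
+    //   y.imag += alpha.imag * x.real + s * alpha.real * x.imag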
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (alpha->real == 0 && alpha->imag == 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFMACC_VF(PREC, LMUL)( yvec_real, alpha->real, xvec_real, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL)( yvec_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..0c2cda842f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c
@@ -0,0 +1,79 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPYV
+
+AXPYV(PRECISION_CHAR, void)
+{
+    // Computes y := y + alpha * conjx(x)
+    //        == y + alpha * x (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (*alpha == 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // AXPYV
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
new file mode 100644
index 0000000000..3571877759
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -0,0 +1,272 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)conjx;
+ (void)cntx;
+ const float* restrict x = x_;
+ float* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)conjx;
+ (void)cntx;
+ const double* restrict x = x_;
+ double* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict x = x_;
+ scomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
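+    // With no conjugation each scomplex (two 32-bit floats) is copied as a single 64-bit
+    // element, so that branch uses vle64/vse64 even though FLT_SIZE is 4; the conjugating
+    // branch loads the real/imaginary planes separately and negates the imaginary one.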
+ if (conjx == BLIS_NO_CONJUGATE) {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * 2 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ } else {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ __asm__("vfneg.v v4, v4");
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define SH_ADD "sh3add "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict x = x_;
+ dcomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE &&
+ incy == 2 * FLT_SIZE) {
+ size_t avl = 2 * n;
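+        // Unit strides and no conjugation: treat the n dcomplex elements as 2*n contiguous
+        // doubles and copy them with plain vle64/vse64; sh3add advances each pointer by
+        // vl * 8 bytes.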
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl));
+ __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl));
+ avl -= vl;
+ }
+ } else {
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+
+ if (conjx == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
new file mode 100644
index 0000000000..0dc8565400
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict rho_, \
+ const cntx_t* cntx \
+)
+
+#define DOTV(...) DOTV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTV
+#undef DOTV_
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..250fab46e6
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c
@@ -0,0 +1,116 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+ // Computes rho = conjxt(x)^T * conjy(y)
+ (void) cntx;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (n <= 0) {
+ rho->real = 0;
+ rho->imag = 0;
+ return;
+ }
+
+ // Instead of conjugating x, switch conjugation on y
+ // and conjugate rho at the end
+ conj_t conjrho = conjxt;
+ if (conjxt == BLIS_CONJUGATE)
+ bli_toggle_conj(&conjy); // Switch conjugation of y
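+    // This relies on the identity conj(x)^T * y == conj(x^T * conj(y)), which avoids
+    // negating the imaginary lanes of x inside the loop.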
+
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ if (first) {
+ acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl);
+ acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl);
+ first = false;
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl);
+ }
+ if (conjy == BLIS_NO_CONJUGATE) {
+ acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl);
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+ if (conjrho == BLIS_CONJUGATE) {
+ sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+ }
+ rho->real = VFMV_F_S(PREC)(sum_real);
+ rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..0ec8e6328a
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c
@@ -0,0 +1,87 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTV
+
+DOTV(PRECISION_CHAR, void)
+{
+ // Computes rho = conjxt(x)^T * conjy(y)
+ // == x^T * y (real case)
+ (void) cntx;
+ (void) conjxt; // Suppress unused parameter warnings
+ (void) conjy;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (n <= 0) {
+ *rho = 0;
+ return;
+ }
+
+ RVV_TYPE_F(PREC, LMUL) acc;
+ size_t avl = n;
+ bool first = true;
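+    // Lane-wise partial sums: the first iteration seeds acc with vfmul, later iterations
+    // accumulate with tail-undisturbed vfmacc so a shorter final vl does not clobber the
+    // remaining lanes, and a single vfredusum folds all lanes at the end.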
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (first) {
+ acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl);
+ first = false;
+ } else
+ acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+ *rho = VFMV_F_S(PREC)(sum);
+}
+
+#endif // DOTV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
new file mode 100644
index 0000000000..048f8d2983
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c
@@ -0,0 +1,130 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ const T* restrict beta_, \
+ T* restrict rho_, \
+ const cntx_t* cntx \
+)
+
+#define DOTXV(...) DOTXV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+#define FMA fmaf
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+#define FMA fma
+
+#include "./bli_dotxv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+#undef FMA
+
+#undef DOTXV
+#undef DOTXV_
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..8245e8e057
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c
@@ -0,0 +1,130 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+ // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (beta->real == 0 && beta->imag == 0){
+ rho->real = 0;
+ rho->imag = 0;
+ } else if (!(beta->real == 1 && beta->imag == 0)) {
+ DATATYPE temp = *rho;
+ rho->real = rho->real * beta->real - rho->imag * beta->imag;
+ rho->imag = temp.real * beta->imag + rho->imag * beta->real;
+ }
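+    // The scalar update above computes rho := beta * rho; temp preserves the old rho.real
+    // so the imaginary part still sees it after rho->real has been overwritten.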
+
+ if (n <= 0 || (alpha->real == 0 && alpha->imag == 0))
+ return;
+
+ // Instead of conjugating x, switch conjugation on y
+ // and conjugate dot product at the end
+ conj_t conjsum = conjxt;
+ if (conjxt == BLIS_CONJUGATE)
+ bli_toggle_conj(&conjy); // Switch conjugation of y
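+    // As in dotv, this uses conj(x)^T * y == conj(x^T * conj(y)) so x itself never needs
+    // to be conjugated inside the loop.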
+
+ // Compute dot product
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ if (first) {
+ acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl);
+ acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl);
+ first = false;
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl);
+ }
+ if (conjy == BLIS_NO_CONJUGATE) {
+ acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl);
+ } else {
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl);
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+
+ if (conjsum == BLIS_CONJUGATE) {
+ sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1);
+ }
+ DATATYPE dot = {VFMV_F_S(PREC)(sum_real), VFMV_F_S(PREC)(sum_imag)};
+
+ // Accumulate alpha * dot
+    rho->real = FMA( alpha->real, dot.real, rho->real);
+    rho->real = FMA(-alpha->imag, dot.imag, rho->real);
+    rho->imag = FMA( alpha->imag, dot.real, rho->imag);
+    rho->imag = FMA( alpha->real, dot.imag, rho->imag);
+
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..f9d9346973
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c
@@ -0,0 +1,94 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTXV
+
+DOTXV(PRECISION_CHAR, void)
+{
+ // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y)
+ // == beta * rho + alpha * x^T * y (real case)
+
+ (void) cntx;
+ (void) conjxt; // Suppress unused parameter warnings
+ (void) conjy;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict beta = beta_;
+ DATATYPE* restrict rho = rho_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+
+ if (*beta == 0)
+ *rho = 0;
+ else if (*beta != 1.0f)
+ *rho *= *beta;
+
+ if (n <= 0 || *alpha == 0)
+ return;
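+    // (With alpha == 0 or n <= 0, rho already holds beta * rho, which is the final result.)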
+
+ // Compute dot product
+ RVV_TYPE_F(PREC, LMUL) acc;
+ size_t avl = n;
+ bool first = true;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (first) {
+ acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl);
+ first = false;
+ } else
+ acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+    *rho = FMA(*alpha, VFMV_F_S(PREC)(sum), *rho);
+}
+
+#endif // DOTXV
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
new file mode 100644
index 0000000000..cbca885929
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -0,0 +1,221 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ float* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ float one = 1.f;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
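+    // f0 holds 1.0f; vfrdiv.vf below computes f0 / x[i], i.e. the elementwise reciprocal.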
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ double* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ double one = 1.;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfrdiv.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ scomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
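+    // Each element is inverted as 1 / (a + bi) = (a - bi) / (a^2 + b^2): the loop negates
+    // the imaginary lanes (v4), forms a^2 + b^2 in v8, then divides both planes by it.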
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ dcomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfneg.v v4, v4");
+ __asm__("vfmul.vv v8, v0, v0");
+ __asm__("vfmacc.vv v8, v4, v4");
+ __asm__("vfdiv.vv v0, v0, v8");
+ __asm__("vfdiv.vv v4, v4, v8");
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
new file mode 100644
index 0000000000..51edc92214
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -0,0 +1,266 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FDIV "fdiv.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ float* restrict x = x_;
+ if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
+ return;
+
+ float one = 1.f;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+ __asm__(FDIV "f0, f0, f1");
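+    // The reciprocal 1.0f / alpha is computed once in f0; the loop then scales x by it,
+    // replacing n divisions with one divide and n multiplies.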
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FDIV "fdiv.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ double* restrict x = x_;
+ if (n <= 0 || *alpha == 0. || *alpha == 1.)
+ return;
+
+ double one = 1.;
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
+ __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha));
+ __asm__(FDIV "f0, f0, f1");
+ incx *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FDIV
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FDIV "fdiv.s "
+#define FNEG "fneg.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ scomplex* restrict x = x_;
+ if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+ return;
+
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FMUL "f2, f0, f0");
+ __asm__(FMADD "f2, f1, f1, f2");
+ __asm__(FDIV "f0, f0, f2");
+ __asm__(FDIV "f1, f1, f2");
+ if (conjalpha == BLIS_NO_CONJUGATE)
+ __asm__(FNEG "f1, f1");
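+    // f0 + f1*i now holds 1/alpha (or 1/conj(alpha)); the loop applies it with a standard
+    // complex multiply: out.real = x.real*f0 - x.imag*f1, out.imag = x.imag*f0 + x.real*f1.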
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FDIV
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FDIV "fdiv.d "
+#define FNEG "fneg.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx,
+ const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ dcomplex* restrict x = x_;
+ if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+ return;
+
+ __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FMUL "f2, f0, f0");
+ __asm__(FMADD "f2, f1, f1, f2");
+ __asm__(FDIV "f0, f0, f2");
+ __asm__(FDIV "f1, f1, f2");
+ if (conjalpha == BLIS_NO_CONJUGATE)
+ __asm__(FNEG "f1, f1");
+ incx *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ } else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("vfmul.vf v8, v0, f0");
+ __asm__("vfmul.vf v12, v4, f0");
+ __asm__("vfnmsac.vf v8, f1, v4");
+ __asm__("vfmacc.vf v12, f1, v0");
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
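
For reference, the complex invscalv kernels above fold the division into a single vector multiply by precomputing conj(alpha)/|alpha|^2 in f0/f1 before the strip-mined loop, flipping the sign of the imaginary part for BLIS_NO_CONJUGATE. A scalar C sketch of that arithmetic follows; the function name, the int conjalpha flag, and the use of C99 complex types are illustrative, not part of the patch:

    #include <complex.h>

    /* Scalar sketch of x[i] := x[i] / conjalpha(alpha) as implemented above:
       multiply by conj(alpha)/|alpha|^2 (no-conjugate case) or alpha/|alpha|^2
       (conjugate case).  Illustrative only; conjalpha != 0 means "conjugate". */
    static void invscalv_ref_z(int conjalpha, int n, double complex alpha,
                               double complex *x, int incx)
    {
        double denom = creal(alpha) * creal(alpha) + cimag(alpha) * cimag(alpha);
        double mr = creal(alpha) / denom;
        double mi = conjalpha ? cimag(alpha) / denom : -cimag(alpha) / denom;
        for (int i = 0; i < n; ++i) {
            double xr = creal(x[i * incx]), xi = cimag(x[i * incx]);
            x[i * incx] = (xr * mr - xi * mi) + (xr * mi + xi * mr) * I;
        }
    }
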
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
new file mode 100644
index 0000000000..cd2dd2c188
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c
@@ -0,0 +1,124 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define SCAL2V(...) SCAL2V_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scal2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCAL2V
+#undef SCAL2V_
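
The intrinsics dispatch files in this patch (this one and the scalv/subv/xpbyv/axpy2v counterparts below) all follow the same pattern: a templated kernel body is re-included once per datatype while DATATYPE, PRECISION_CHAR, PREC and LMUL are redefined, and token pasting produces the per-type function names. A minimal self-contained sketch of that pattern, with the body inlined as a macro instead of a separate include so it compiles on its own (the demo_* names are illustrative):

    #include <stdio.h>

    /* One templated body, instantiated per precision via token pasting --
       the same idea as the #define/#include blocks above, shown inline. */
    #define DEFINE_SCAL2V(PRECISION_CHAR, T)                         \
        static void demo_##PRECISION_CHAR##scal2v(int n, T alpha,    \
                                                  const T *x, T *y)  \
        {                                                            \
            for (int i = 0; i < n; ++i)                              \
                y[i] = alpha * x[i];                                 \
        }

    DEFINE_SCAL2V(s, float)  /* expands to demo_sscal2v */
    DEFINE_SCAL2V(d, double) /* expands to demo_dscal2v */

    int main(void)
    {
        float x[3] = {1.0f, 2.0f, 3.0f}, y[3];
        demo_sscal2v(3, 2.0f, x, y);
        printf("%g %g %g\n", y[0], y[1], y[2]);
        return 0;
    }
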
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..4a25ce3e32
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c
@@ -0,0 +1,100 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+ // Computes y = alpha * conjx(x)
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (alpha->real == 0 && alpha->imag == 0) {
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+
+ if (alpha->real == 1 && alpha->imag == 0) {
+ COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+ yvec_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl);
+ yvec_imag = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->imag, vl);
+ if (conjx == BLIS_NO_CONJUGATE) {
+ yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFMACC_VF( PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ yvec_real = VFMACC_VF( PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl);
+ yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use
+ // __riscv_vcreate_v_f once they become available in LLVM.
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wuninitialized"
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+ #pragma GCC diagnostic pop
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // SCAL2V
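
Per element, the loop above computes y = alpha * conjx(x) split into real and imaginary lanes, with VFMUL seeding each lane and VFMACC/VFNMSAC folding in the cross terms. A plain scalar reference of the same update (the name and the single-precision choice are illustrative):

    #include <complex.h>

    /* Scalar reference for y[i] = alpha * conjx(x[i]), matching the
       mul / nmsac / macc ordering of the vector kernel above. */
    static void scal2v_ref_c(int conjx, int n, float complex alpha,
                             const float complex *x, int incx,
                             float complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            float complex xi = conjx ? conjf(x[i * incx]) : x[i * incx];
            float yr = crealf(alpha) * crealf(xi) - cimagf(alpha) * cimagf(xi);
            float yi = cimagf(alpha) * crealf(xi) + crealf(alpha) * cimagf(xi);
            y[i * incy] = yr + yi * I;
        }
    }
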
diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..7084e15cf5
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c
@@ -0,0 +1,82 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCAL2V
+
+SCAL2V(PRECISION_CHAR, void)
+{
+ // Computes y = alpha * conjx(x)
+ // == alpha * x (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+ if (*alpha == 0) {
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx);
+ return;
+ }
+
+ if (*alpha == 1) {
+ COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, xvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // SCAL2V
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
new file mode 100644
index 0000000000..b5788d632d
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c
@@ -0,0 +1,120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\
+ conj_t conjalpha, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ T* restrict x_, inc_t incx, \
+ const cntx_t* cntx \
+)
+
+#define SCALV(...) SCALV_(__VA_ARGS__)
+
+#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm
+#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_scalv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SCALV
+#undef SCALV_
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..c6803c9676
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+ // Computes x = conjalpha(alpha) * x
+ const DATATYPE* restrict alpha = alpha_;
+ DATATYPE* restrict x = x_;
+
+ if (n <= 0 || (alpha->real == 1 && alpha->imag == 0)) return;
+
+ if (alpha->real == 0 && alpha->imag==0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+
+ RVV_TYPE_F(PREC, LMUL) temp_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl);
+ RVV_TYPE_F(PREC, LMUL) temp_imag = VFMUL_VF(PREC, LMUL)(xvec_imag, alpha->real, vl);
+ if (conjalpha == BLIS_NO_CONJUGATE) {
+ temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, alpha->imag, xvec_imag, vl);
+ temp_imag = VFMACC_VF(PREC, LMUL)( temp_imag, alpha->imag, xvec_real, vl);
+ } else {
+ temp_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->imag, xvec_imag, vl);
+ temp_imag = VFNMSAC_VF(PREC, LMUL)(temp_imag, alpha->imag, xvec_real, vl);
+ }
+
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, temp_real);
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, temp_imag);
+
+ if (incx == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, xvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, xvec, vl);
+
+ x += vl*incx;
+ avl -= vl;
+ }
+
+}
+
+#endif // SCALV
diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..2b4e31d359
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c
@@ -0,0 +1,76 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SCALV
+
+SCALV(PRECISION_CHAR, void)
+{
+ // Computes x = conjalpha(alpha) * x
+ // == alpha * x (real case)
+
+ (void) conjalpha; // Suppress unused parameter warnings
+ const DATATYPE* restrict alpha = alpha_;
+ DATATYPE* restrict x = x_;
+
+ if (n <= 0 || *alpha == 1) return;
+
+ if (*alpha == 0){
+ SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx);
+ return;
+ }
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl);
+
+ if (incx == 1)
+ VSE_V_F(PREC, LMUL) (x, xvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl);
+
+ x += vl * incx;
+ avl -= vl;
+ }
+}
+
+#endif // SCALV
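
The real scalv loop above is the standard RVV strip-mining shape: ask vsetvl for up to avl elements, process the vl it grants, advance, repeat. The same shape written directly against the public RVV intrinsics, without the BLIS wrapper macros; this sketch assumes a toolchain that ships the prefixed __riscv_ intrinsics (e.g. recent Clang) and covers the unit-stride float case only:

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Contiguous float scalv, x[i] *= alpha, using the same strip-mining
       structure as the kernels above (unit stride only, for brevity). */
    static void scalv_f32_contig(size_t n, float alpha, float *x)
    {
        size_t avl = n;
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);           /* elements granted */
            vfloat32m8_t xv = __riscv_vle32_v_f32m8(x, vl);  /* load             */
            xv = __riscv_vfmul_vf_f32m8(xv, alpha, vl);      /* scale            */
            __riscv_vse32_v_f32m8(x, xv, vl);                /* store back       */
            x += vl;
            avl -= vl;
        }
    }
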
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
new file mode 100644
index 0000000000..ef9091f16c
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
@@ -0,0 +1,204 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ float* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ incx *= FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)conjalpha;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ double* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ incx *= FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLSE "vlse32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+
+void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ scomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(VLSE "v4, (t0), zero");
+ if (conjalpha == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+ incx *= 2 * FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSE
+#undef VSSEG2
+#undef VSSSEG2
+
+#define FLT_SIZE 8
+#define VLSE "vlse64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_,
+ void * restrict x_, inc_t incx, const cntx_t *cntx) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ dcomplex* restrict x = x_;
+ if (n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
+ :
+ : "r"(n), "i"(8 * FLT_SIZE));
+ __asm__(VLSE "v0, (%0), zero" : : "r"(alpha));
+ __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(VLSE "v4, (t0), zero");
+ if (conjalpha == BLIS_CONJUGATE)
+ __asm__("vfneg.v v4, v4");
+ incx *= 2 * FLT_SIZE;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ avl -= vl;
+ }
+ return;
+}
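
The setv kernels above broadcast alpha into a vector register group once, before the loop, by issuing a strided load with stride zero (vlse*.v ..., zero); the loop body then only stores. The same broadcast expressed with the standard vfmv.v.f intrinsic, as a unit-stride float sketch (the splat is redone per iteration here purely to keep the sketch short; toolchain and naming assumptions as in the previous sketch):

    #include <stddef.h>
    #include <riscv_vector.h>

    /* Contiguous float setv, x[i] = alpha.  The asm above hoists the
       broadcast out of the loop; here it is repeated for simplicity. */
    static void setv_f32_contig(size_t n, float alpha, float *x)
    {
        size_t avl = n;
        while (avl) {
            size_t vl = __riscv_vsetvl_e32m8(avl);
            vfloat32m8_t v = __riscv_vfmv_v_f_f32m8(alpha, vl); /* splat alpha */
            __riscv_vse32_v_f32m8(x, v, vl);
            x += vl;
            avl -= vl;
        }
    }
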
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
new file mode 100644
index 0000000000..e6b483a3f8
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c
@@ -0,0 +1,118 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* cntx \
+)
+
+#define SUBV(...) SUBV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_subv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_subv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_subv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef SUBV
+#undef SUBV_
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..2d4a1a017f
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c
@@ -0,0 +1,89 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+SUBV(PRECISION_CHAR, void)
+{
+ // Computes y := y - conjx(x)
+ (void) cntx;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ yvec_real = VFSUB_VV(PREC, LMUL)(yvec_real, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+ else
+ yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl);
+
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real);
+ yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+
+}
+
+#endif // SUBV
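
In the complex subv above, conjugation of x only affects the imaginary lane, which is why the kernel switches between VFSUB_VV and VFADD_VV there while the real lane is always a subtraction. A scalar reference for the same semantics (illustrative name, single-precision complex):

    #include <complex.h>

    /* Scalar reference for y[i] -= conjx(x[i]); with conjugation the
       imaginary parts are effectively added, matching the VFADD_VV path. */
    static void subv_ref_c(int conjx, int n, const float complex *x, int incx,
                           float complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            float complex xi = conjx ? conjf(x[i * incx]) : x[i * incx];
            y[i * incy] -= xi;
        }
    }
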
diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..b158594319
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c
@@ -0,0 +1,77 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef SUBV
+
+SUBV(PRECISION_CHAR, void)
+{
+ // Computes y = y - conjx(x)
+ // == y - x (real case)
+ (void) cntx;
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFSUB_VV(PREC, LMUL)(yvec, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // SUBV
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
new file mode 100644
index 0000000000..2342e254a2
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
@@ -0,0 +1,245 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_,
+ inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ float* restrict x = x_;
+ float* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ double* restrict x = x_;
+ double* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ scomplex* restrict x = x_;
+ scomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * 2 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VLE "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSE "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+
+void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
+ void * restrict y_, inc_t incy, const cntx_t *cntx) {
+ (void)cntx;
+ dcomplex* restrict x = x_;
+ dcomplex* restrict y = y_;
+ if (n <= 0)
+ return;
+
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(x));
+ else
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(y));
+ else
+ __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy));
+
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(x));
+ else
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
+ if (incy == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(y));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
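
A note on cswapv above: FLT_SIZE stays 4 but the loads/stores are e64 with stride 2*FLT_SIZE = 8 bytes, so each 64-bit element moves a full real/imaginary pair and no segment instructions are needed; swapv performs no arithmetic, only data movement. The corresponding scalar semantics (the scomplex_ref struct and names are illustrative stand-ins):

    #include <stddef.h>

    typedef struct { float real, imag; } scomplex_ref; /* stand-in for scomplex */

    /* Scalar reference for cswapv: exchange n complex elements of x and y. */
    static void swapv_c_ref(size_t n, scomplex_ref *x, ptrdiff_t incx,
                            scomplex_ref *y, ptrdiff_t incy)
    {
        for (size_t i = 0; i < n; ++i) {
            scomplex_ref tmp = x[i * incx];
            x[i * incx] = y[i * incy];
            y[i * incy] = tmp;
        }
    }
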
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..dce4085bff
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\
+ conj_t conjx, \
+ dim_t n, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict beta_, \
+ T* restrict y_, inc_t incy, \
+ const cntx_t* restrict cntx \
+)
+
+#define XPBYV(...) XPBYV_(__VA_ARGS__)
+
+#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm
+#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_xpbyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef XPBYV
+#undef XPBYV_
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..4c86e8b36a
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c
@@ -0,0 +1,101 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+XPBYV(PRECISION_CHAR, void)
+{
+ // Computes y = beta * y + conjx(x)
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ if (beta->real == 0 && beta->imag == 0){
+ COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // TO DO (optimization): beta = +-1, +-i special cases
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+
+ // xpbyv is computed with FMAs as follows:
+ // y[i].real = x[i].real + beta.real * y[i].real - beta.imag * y[i].imag
+ // y[i].imag = conjx(x[i]).imag + beta.imag * y[i].real + beta.real * y[i].imag
+
+ xvec_real = VFMACC_VF( PREC, LMUL)(xvec_real, beta->real, yvec_real, vl);
+ xvec_real = VFNMSAC_VF(PREC, LMUL)(xvec_real, beta->imag, yvec_imag, vl);
+ if (conjx == BLIS_NO_CONJUGATE)
+ xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl);
+ else
+ xvec_imag = VFMSAC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl);
+ xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->real, yvec_imag, vl);
+
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real);
+ xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag);
+
+ if (incy == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, xvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, xvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ avl -= vl;
+ }
+}
+
+#endif // XPBYV
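
The FMA grouping documented in the comment above can be checked against a direct scalar evaluation of y = beta * y + conjx(x); a reference sketch in double-precision complex (name illustrative):

    #include <complex.h>

    /* Scalar reference for xpbyv: y[i] = beta * y[i] + conjx(x[i]). */
    static void xpbyv_ref_z(int conjx, int n,
                            const double complex *x, int incx,
                            double complex beta,
                            double complex *y, int incy)
    {
        for (int i = 0; i < n; ++i) {
            double complex xi = conjx ? conj(x[i * incx]) : x[i * incx];
            y[i * incy] = beta * y[i * incy] + xi;
        }
    }
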
diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..b23272fea4
--- /dev/null
+++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c
@@ -0,0 +1,84 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef XPBYV
+
+XPBYV(PRECISION_CHAR, void)
+{
+ // Computes y = beta * y + conjx(x)
+ // == beta * y + x (real case)
+ (void) conjx; // Suppress unused parameter warnings
+ const DATATYPE* restrict beta = beta_;
+ const DATATYPE* restrict x = x_;
+ DATATYPE* restrict y = y_;
+
+ if (n <= 0) return;
+
+ if (*beta == 0){
+ COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx);
+ return;
+ }
+
+ // TO DO (optimization): beta = +-1 special cases
+
+ size_t avl = n;
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL) (x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL) (y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ yvec = VFMADD_VF(PREC, LMUL)(yvec, *beta, xvec, vl);
+
+ if (incy == 1)
+ VSE_V_F(PREC, LMUL) (y, yvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl);
+
+ x += vl * incx;
+ y += vl * incy;
+ avl -= vl;
+ }
+}
+
+#endif // XPBYV
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
new file mode 100644
index 0000000000..1b5ce3b962
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+
+#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\
+ conj_t conjx, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alphax_, \
+ const T* restrict alphay_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict z_, inc_t incz, \
+ const cntx_t* restrict cntx \
+)
+
+#define AXPY2V(...) AXPY2V_(__VA_ARGS__)
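+// Instantiate the kernel once per datatype by defining the type/precision
+// macros and textually including the real or complex body. Real kernels use
+// LMUL = m8; complex kernels use m4, since each segment load/store carries
+// two LMUL-wide fields (the real and imaginary planes).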
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpy2v_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_axpy2v_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef AXPY2V
+#undef AXPY2V_
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..9b57198272
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c
@@ -0,0 +1,117 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+ // Computes z := z + alphax * conjx(x) + alphay * conjy(y)
+ const DATATYPE* restrict alphax = alphax_;
+ const DATATYPE* restrict alphay = alphay_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag;
+
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ if (incz == 1)
+ zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl);
+ else
+ zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+ zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+ zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+
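+ // Complex FMA expanded into real arithmetic: for a = ar + ai*i and x = xr + xi*i,
+ //   a*x       = (ar*xr - ai*xi) + (ai*xr + ar*xi)*i
+ //   a*conj(x) = (ar*xr + ai*xi) + (ai*xr - ar*xi)*i
+ // The branches below pick the sign of the xi terms accordingly.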
+ // + alphax * conjx(x)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->real, xvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphax->imag, xvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->real, xvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->imag, xvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphax->real, xvec_imag, vl);
+ }
+
+ // + alphay * conjy(y)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->real, yvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->imag, yvec_real, vl);
+ if (conjy == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphay->imag, yvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->real, yvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->imag, yvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphay->real, yvec_imag, vl);
+ }
+
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real);
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag);
+
+ if (incz == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..cebb159973
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c
@@ -0,0 +1,91 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef AXPY2V
+
+AXPY2V(PRECISION_CHAR, void)
+{
+ // Computes z := z + alphax * conjx(x) + alphay * conjy(y)
+ // == z + alphax * x + alphay * y (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ (void) conjy;
+ const DATATYPE* restrict alphax = alphax_;
+ const DATATYPE* restrict alphay = alphay_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec;
+
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (incz == 1)
+ zvec = VLE_V_F(PREC, LMUL)(z, vl);
+ else
+ zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl);
+
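+ // Two scalar-times-vector FMAs per element: z += alphax * x, then z += alphay * y.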
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphax, xvec, vl);
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphay, yvec, vl);
+
+ if (incz == 1)
+ VSE_V_F(PREC, LMUL)(z, zvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+}
+
+#endif // AXPY2V
diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
new file mode 100644
index 0000000000..43c2ba44e2
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c
@@ -0,0 +1,430 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_, inc_t inca,
+ inc_t lda, const void *restrict x_, inc_t incx,
+ void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
+ (void)conja;
+ (void)conjx;
+ (void)cntx;
+ const float *restrict alpha = alpha_;
+ const float *restrict a = a_;
+ const float *restrict x = x_;
+ float *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ // process vl elements of y at a time
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ // x_tmp traverses x
+ // a points to the vl x b block of a needed this iteration
+ // a_tmp traverses the columns of this block
+ const float* restrict x_tmp = x;
+ const float* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+
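+ // Accumulate the remaining b-1 columns: v0 += x[i] * a(:, i).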
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ }
+
+ if (incy == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSE "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_, inc_t inca,
+ inc_t lda, const void *restrict x_, inc_t incx,
+ void *restrict y_, inc_t incy, const cntx_t *restrict cntx) {
+ (void)conja;
+ (void)conjx;
+ (void)cntx;
+ const double *restrict alpha = alpha_;
+ const double *restrict a = a_;
+ const double *restrict x = x_;
+ double *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ // process vl elements of y at a time
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ // x_tmp traverses x
+ // a points to the vl x b block of a needed this iteration
+ // a_tmp traverses the columns of this block
+ const double* restrict x_tmp = x;
+ const double* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ if (inca == FLT_SIZE)
+ __asm__(VLE "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ }
+
+ if (incy == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSE "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft11, v0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLSEG "vlseg2e32.v "
+#define VLSSEG "vlsseg2e32.v "
+#define VSSEG "vsseg2e32.v "
+#define VSSSEG "vssseg2e32.v "
+
+void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_,
+ inc_t inca, inc_t lda, const void *restrict x_,
+ inc_t incx, void *restrict y_, inc_t incy,
+ const cntx_t *restrict cntx) {
+ (void)cntx;
+ const scomplex *restrict alpha = alpha_;
+ const scomplex *restrict a = a_;
+ const scomplex *restrict x = x_;
+ scomplex *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmul.vf v0, v24, ft0");
+ __asm__("vfmul.vf v4, v24, ft1");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmsac.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmacc.vf v4, ft0, v28");
+ }
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ }
+ }
+
+ if (incy == 2 * FLT_SIZE) {
+ __asm__(VLSEG "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSEG "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG
+#undef VLSSEG
+#undef VSSEG
+#undef VSSSEG
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLSEG "vlseg2e64.v "
+#define VLSSEG "vlsseg2e64.v "
+#define VSSEG "vsseg2e64.v "
+#define VSSSEG "vssseg2e64.v "
+
+void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b,
+ const void *restrict alpha_, const void *restrict a_,
+ inc_t inca, inc_t lda, const void *restrict x_,
+ inc_t incx, void *restrict y_, inc_t incy,
+ const cntx_t *restrict cntx) {
+ (void)cntx;
+ const dcomplex *restrict alpha = alpha_;
+ const dcomplex *restrict a = a_;
+ const dcomplex *restrict x = x_;
+ dcomplex *restrict y = y_;
+
+ if (m == 0 || b == 0)
+ return;
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ size_t avl = m;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma"
+ : "=r"(vl)
+ : "r"(avl), "i"(8 * FLT_SIZE));
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_tmp = a;
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmul.vf v0, v24, ft0");
+ __asm__("vfmul.vf v4, v24, ft1");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmsac.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else {
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmacc.vf v4, ft0, v28");
+ }
+
+ for (dim_t i = 1; i < b; ++i) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE));
+ if (inca == 2 * FLT_SIZE)
+ __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp));
+ else
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda));
+ __asm__("vfmacc.vf v0, ft0, v24");
+ if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) {
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfmacc.vf v4, ft0, v28");
+ } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) {
+ __asm__("vfmacc.vf v4, ft1, v24");
+ __asm__("vfmacc.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE
+ __asm__("vfnmsac.vf v4, ft1, v24");
+ __asm__("vfnmsac.vf v0, ft1, v28");
+ __asm__("vfnmsac.vf v4, ft0, v28");
+ }
+ }
+
+ if (incy == 2 * FLT_SIZE) {
+ __asm__(VLSEG "v24, (%0)" : : "r"(y));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSEG "v24, (%0)" : : "r"(y));
+ } else {
+ __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ __asm__("vfmacc.vf v24, ft10, v0");
+ __asm__("vfmacc.vf v28, ft10, v4");
+ __asm__("vfnmsac.vf v24, ft11, v4");
+ __asm__("vfmacc.vf v28, ft11, v0");
+ __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
+ avl -= vl;
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
new file mode 100644
index 0000000000..9cd1071d7a
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c
@@ -0,0 +1,122 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+
+#include <math.h>
+#include <riscv_vector.h>
+#include "blis.h"
+#include "../../riscv_overloaded_intrinsics.h"
+
+#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\
+ conj_t conjxt, \
+ conj_t conjx, \
+ conj_t conjy, \
+ dim_t n, \
+ const T* restrict alpha_, \
+ const T* restrict x_, inc_t incx, \
+ const T* restrict y_, inc_t incy, \
+ T* restrict rho_, \
+ T* restrict z_, inc_t incz, \
+ const cntx_t* restrict cntx \
+)
+
+#define DOTAXPYV(...) DOTAXPYV_(__VA_ARGS__)
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PREC 32
+#define LMUL m8
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PREC 64
+#define LMUL m8
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define BASE_DT float
+#define PRECISION_CHAR c
+#define PREC 32
+#define LMUL m4
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define BASE_DT double
+#define PRECISION_CHAR z
+#define PREC 64
+#define LMUL m4
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_dotaxpyv_sifive_x280_intr_complex.c"
+
+#undef DATATYPE
+#undef BASE_DT
+#undef PRECISION_CHAR
+#undef PREC
+#undef LMUL
+#undef FLT_SIZE
+
+#undef DOTAXPYV
+#undef DOTAXPYV_
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
new file mode 100644
index 0000000000..c3cd06c523
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c
@@ -0,0 +1,151 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+ // Computes z := z + alpha * conjx(x)
+ // and rho := conjxt(x)^T * conjy(y)
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict rho = rho_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+ bool first = true;
+ RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag;
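+ // The accumulators live across strip-mined iterations; tail-undisturbed (_TU)
+ // FMAs are used after the first pass so lanes beyond a final, shorter vl
+ // keep their previously accumulated partial sums.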
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec;
+ RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag;
+
+ // Loads
+ if (incx == 1)
+ xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl);
+ else
+ xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl);
+
+ if (incy == 1)
+ yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl);
+ else
+ yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl);
+
+ if (incz == 1)
+ zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl);
+ else
+ zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl);
+
+ xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0);
+ xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1);
+ yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0);
+ yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1);
+ zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0);
+ zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1);
+
+ // z := z + alpha * conjx(x)
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->real, xvec_real, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->imag, xvec_real, vl);
+ if (conjx == BLIS_NO_CONJUGATE){
+ zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alpha->imag, xvec_imag, vl);
+ zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->real, xvec_imag, vl);
+ } else {
+ zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->imag, xvec_imag, vl);
+ zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alpha->real, xvec_imag, vl);
+ }
+
+ // rho := conjxt(x)^T * conjy(y)
+ // We accumulate the current term of the dot product as (a*c-b*d) + (a*d+b*c)*i,
+ // conjugating when necessary
+ if (first) {
+ // Initialize real part: a*c
+ acc_real = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_real, vl);
+ // Initialize imaginary part: a*d
+ acc_imag = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_imag, vl);
+ if (conjy == BLIS_CONJUGATE)
+ acc_imag = VFNEG_VF(PREC, LMUL)(acc_imag, vl); // TO DO: eliminate this negation
+ first = false;
+ } else {
+ // Accumulate real part: a*c
+ acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_real, yvec_real, vl);
+ // Accumulate imaginary part: a*d
+ if (conjy == BLIS_NO_CONJUGATE)
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ else
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl);
+ }
+ // Finish real part: b*d
+ if ((conjxt == BLIS_NO_CONJUGATE) ^ (conjy == BLIS_NO_CONJUGATE))
+ // Exactly one is conjugated => add
+ acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ else
+ acc_real = VFNMSAC_VV_TU(PREC,LMUL)(acc_real, xvec_imag, yvec_imag, vl);
+ // Finish imaginary part: b*c
+ if (conjxt == BLIS_NO_CONJUGATE)
+ acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+ else
+ acc_imag = VFNMSAC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl);
+
+ // Stores
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real);
+ zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag);
+
+ if (incz == 1)
+ VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl);
+ else
+ VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+ // Compute rho
+ RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)( 0.f, 1);
+ RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)( 0.f, 1);
+ sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n);
+ sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n);
+ rho->real = VFMV_F_S(PREC)(sum_real);
+ rho->imag = VFMV_F_S(PREC)(sum_imag);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
new file mode 100644
index 0000000000..adaf3610b0
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c
@@ -0,0 +1,111 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef DOTAXPYV
+
+DOTAXPYV(PRECISION_CHAR, void)
+{
+ // Computes z := z + alpha * conjx(x)
+ //  == z + alpha * x (real case)
+ // and rho := conjxt(x)^T * conjy(y)
+ // == x^T * y (real case)
+
+ (void) conjx; // Suppress unused parameter warnings
+ (void) conjxt;
+ (void) conjy;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict x = x_;
+ const DATATYPE* restrict y = y_;
+ DATATYPE* restrict rho = rho_;
+ DATATYPE* restrict z = z_;
+
+ if (n <= 0)
+ return;
+
+ size_t avl = n;
+ bool first = true;
+ RVV_TYPE_F(PREC, LMUL) acc;
+
+ while (avl) {
+ size_t vl = VSETVL(PREC, LMUL)(avl);
+ RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec;
+
+ // Loads
+ if (incx == 1)
+ xvec = VLE_V_F(PREC, LMUL)(x, vl);
+ else
+ xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl);
+
+ if (incy == 1)
+ yvec = VLE_V_F(PREC, LMUL)(y, vl);
+ else
+ yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl);
+
+ if (incz == 1)
+ zvec = VLE_V_F(PREC, LMUL)(z, vl);
+ else
+ zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl);
+
+ // z := z + alpha * x
+ zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, xvec, vl);
+
+ // rho := x^T * y
+ if (first){
+ acc = VFMUL_VV(PREC, LMUL)( xvec, yvec, vl);
+ first = false;
+ } else {
+ acc = VFMACC_VV_TU(PREC, LMUL)( acc, xvec, yvec, vl);
+ }
+
+ // Store
+ if (incz == 1)
+ VSE_V_F(PREC, LMUL)(z, zvec, vl);
+ else
+ VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl);
+
+ x += vl*incx;
+ y += vl*incy;
+ z += vl*incz;
+ avl -= vl;
+ }
+
+ // Compute rho
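+ // vfredusum reduces the LMUL-wide accumulator into element 0 of an m1
+ // register seeded with 0; vfmv.f.s then extracts the scalar result.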
+ RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)( 0.f, 1);
+ sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n);
+ *rho = VFMV_F_S(PREC)(sum);
+
+}
+
+#endif // ifdef DOTAXPYV
diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
new file mode 100644
index 0000000000..ecb340707b
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c
@@ -0,0 +1,3120 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sdotxaxpyf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ ) {
+ (void)conjat;
+ (void)conja;
+ (void)conjw;
+ (void)conjx;
+ (void)cntx;
+ const float *restrict alpha = alpha_;
+ const float *restrict beta = beta_;
+ const float *restrict a = a_;
+ const float *restrict w = w_;
+ const float *restrict x = x_;
+ float *restrict y = y_;
+ float *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.f) {
+ // scale y by beta
+ if (*beta == 0.f)
+ bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incw *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ incz *= FLT_SIZE;
+ inc_t a_bump = 5 * lda;
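+ // Main loop: handle five columns of a per pass (equivalently, five rows of
+ // a^T). Register plan: ft10 = alpha, ft11 = beta; v28 holds the current chunk
+ // of w; v24 is a scratch column of a; v0, v4, v8, v12, v16 accumulate the dot
+ // products of w with the five columns; v20 accumulates the a*x contribution
+ // that is added to z.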
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const float* restrict w_tmp = w;
+ const float* restrict z_tmp = z;
+ const float* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ const float* restrict w_tmp = w;
+ const float* restrict z_tmp = z;
+ const float* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_ddotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
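+ // Fused operation (per the BLIS dotxaxpyf interface): y := beta*y + alpha*A^T*w
+ // and z := z + alpha*A*x, with A of size m x b. Conjugation arguments have no
+ // effect in the real domain, so they are ignored below.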
+ (void)conjat;
+ (void)conja;
+ (void)conjw;
+ (void)conjx;
+ (void)cntx;
+ const double *restrict alpha = alpha_;
+ const double *restrict beta = beta_;
+ const double *restrict a = a_;
+ const double *restrict w = w_;
+ const double *restrict x = x_;
+ double *restrict y = y_;
+ double *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.) {
+ // scale y by beta
+ if (*beta == 0.)
+ bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incw *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ incz *= FLT_SIZE;
+ inc_t a_bump = 5 * lda;
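+ // Main loop: handle b five at a time. For each chunk of m, v0/v4/v8/v12/v16
+ // accumulate the elementwise products of w with each of the five a vectors
+ // (reduced to dot products after the chunk loop), v20 accumulates
+ // sum_j x[j]*a_j for the axpyf update of z, v24 holds the current a chunk,
+ // and v28 holds the w chunk.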
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const double* restrict w_tmp = w;
+ const double* restrict z_tmp = z;
+ const double* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmul.vv v0, v24, v28");
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmul.vv v16, v24, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vf v20, v24, ft0");
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft4, v24");
+ __asm__("vfmacc.vv v16, v24, v28");
+ }
+ } // end a non-unit stride
+
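+ // Fold this chunk's axpyf partial into z: z := z + alpha*v20, using unit-stride
+ // or strided vector accesses depending on incz.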
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
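+ // Reduce each accumulator to a scalar: v31[0] is zeroed and used as the initial
+ // value for vfredusum.vs, whose result lands in element 0 of the destination and
+ // is then combined with alpha and beta*y.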
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ const double* restrict w_tmp = w;
+ const double* restrict z_tmp = z;
+ const double* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmul.vv v12, v24, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmul.vv v8, v24, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmul.vv v4, v24, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmul.vv v0, v24, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft3, v24");
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft2, v24");
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vf v20, ft1, v24");
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vf v20, ft0, v24");
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+
+ if (incz == FLT_SIZE) {
+ __asm__(VLE "v24, (%0)" : : "r"(z_tmp));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSE "v24, (%0)" : : "r"(z_tmp));
+ } else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ __asm__("vfmacc.vf v24, ft10, v20");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FNMSUB "fnmsub.s "
+#define FNEG "fneg.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSE "vse32.v "
+
+void bli_cdotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
+ (void)cntx;
+ const scomplex *restrict alpha = alpha_;
+ const scomplex *restrict beta = beta_;
+ const scomplex *restrict a = a_;
+ const scomplex *restrict w = w_;
+ const scomplex *restrict x = x_;
+ scomplex *restrict y = y_;
+ scomplex *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+ // scale y by beta
+ if (beta->real == 0.f && beta->imag == 0.f)
+ bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+ // and fa6-fa7 to store beta
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to case when A^T is not conjugated, then conjugate
+ // computed product A^T * w if needed.
+ conj_t conjatw = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjw);
+ bli_toggle_conj(&conjatw);
+ }
+ conj_t conjax = BLIS_NO_CONJUGATE;
+ if (conja == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conja);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjax);
+ }
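+ // Similarly, conj(A)*x == conj(A*conj(x)): when conja is conjugated, toggle conjx
+ // instead and record conjax so the accumulated A*x partials are conjugated when
+ // they are folded into z.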
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incw *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ incz *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 5 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const scomplex* restrict w_tmp = w;
+ const scomplex* restrict z_tmp = z;
+ const scomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
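+ // Register layout for this loop: v28/v30 hold the real/imag parts of the current
+ // w chunk (the segment loads deinterleave them), v24/v26 the current a chunk,
+ // v0/v2 through v16/v18 the five complex dot-product accumulators, and v20/v22
+ // the complex axpyf partial for z.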
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 5 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 5
+ const scomplex* restrict w_tmp = w;
+ const scomplex* restrict z_tmp = z;
+ const scomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef FNEG
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FNMSUB "fnmsub.d "
+#define FNEG "fneg.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSE "vse64.v "
+
+void bli_zdotxaxpyf_sifive_x280_asm
+ (
+ conj_t conjat,
+ conj_t conja,
+ conj_t conjw,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict w_, inc_t incw,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ void* restrict z_, inc_t incz,
+ const cntx_t* restrict cntx
+ )
+{
+ (void)cntx;
+ const dcomplex *restrict alpha = alpha_;
+ const dcomplex *restrict beta = beta_;
+ const dcomplex *restrict a = a_;
+ const dcomplex *restrict w = w_;
+ const dcomplex *restrict x = x_;
+ dcomplex *restrict y = y_;
+ dcomplex *restrict z = z_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+ // scale y by beta
+ if (beta->real == 0. && beta->imag == 0.)
+ bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha,
+ // and fa6-fa7 to store beta
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to case when A^T is not conjugated, then conjugate
+ // computed product A^T * w if needed.
+ conj_t conjatw = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjw);
+ bli_toggle_conj(&conjatw);
+ }
+ conj_t conjax = BLIS_NO_CONJUGATE;
+ if (conja == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conja);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjax);
+ }
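+ // For reference, a pseudocode sketch (names and indexing illustrative
+ // only, not part of the kernel) of the fused operation dotxaxpyf
+ // performs, with A viewed as m x b, w and z of length m, and x and y of
+ // length b:
+ //
+ //   y := beta * y + alpha * conjat(A)^T * conjw(w)
+ //   z := z + alpha * conja(A) * conjx(x)
+ //
+ // i.e. per element (roughly; when beta is zero, y is written without
+ // being read),
+ //
+ //   y[j] = beta*y[j] + alpha * sum_i conjat(A[i][j]) * conjw(w[i])
+ //   z[i] = z[i]      + alpha * sum_j conja(A[i][j])  * conjx(x[j])
+ //
+ // The loop below fuses both updates, handling 5 columns of A (5 elements
+ // of y) per outer pass and vl rows per vectorized inner pass.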
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incw *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ incz *= 2 * FLT_SIZE;
+ // a_bump bumps a down 5 rows per outer iteration; y_bump is incy minus
+ // the FLT_SIZE already added to y between storing the real and imaginary
+ // parts of each element of y.
+ inc_t a_bump = 5 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 5) {
+ // compute dot product of w with 5 rows of a
+ const dcomplex* restrict w_tmp = w;
+ const dcomplex* restrict z_tmp = z;
+ const dcomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); }
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vf(v20, v22, v24, v26, ft0, ft1);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft8, ft9, v24, v26);
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 5 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 5;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 5
+ const dcomplex* restrict w_tmp = w;
+ const dcomplex* restrict z_tmp = z;
+ const dcomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incw == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw));
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ if (inca == 2 * FLT_SIZE) {
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjw == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjw = no conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_NO_CONJUGATE
+ else { // conjw == BLIS_CONJUGATE
+ // a non-unit stride, conjw = conj
+ if (first) {
+ switch (b) {
+ case 4:
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); }
+ __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x));
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); }
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft6, ft7, v24, v26);
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft4, ft5, v24, v26);
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vf(v20, v22, ft2, ft3, v24, v26);
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vf(v20, v22, ft0, ft1, v24, v26);
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjw == BLIS_CONJUGATE
+ } // end a non-unit stride
+
+ if (incz == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp));
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ if (conjax == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v24, v26, ft10, ft11, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22);
+ }
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz));
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw));
+ __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft10, ft11);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, fa6, fa7, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatw == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft10, ft11, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
new file mode 100644
index 0000000000..5ac2d41667
--- /dev/null
+++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c
@@ -0,0 +1,2645 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+
+void bli_sdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ // Think of a as a b x m row-major matrix (i.e. rsa = lda, csa = inca).
+ // We process 6 elements of y per iteration. a points to the 6 x m block
+ // of a needed for this iteration; each 6 x m block is broken into
+ // 6 x vl blocks. a_col points to the current 6 x vl block, x_tmp is used
+ // to load from x, and a_row is used to load each of the 6 rows of this
+ // 6 x vl block.
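+ // For reference, a sketch (illustrative pseudocode, not part of the
+ // kernel) of the operation dotxf performs:
+ //
+ //   y := beta * y + alpha * conjat(A)^T * conjx(x)
+ //
+ // with A an m x b matrix, x of length m, and y of length b; per element,
+ //
+ //   y[j] = beta*y[j] + alpha * sum_i A[i][j] * x[i]
+ //
+ // Conjugation is a no-op in the real domain, hence the (void) casts of
+ // conjat and conjx below.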
+ (void)conjat;
+ (void)conjx;
+ (void)cntx;
+ const float* restrict alpha = alpha_;
+ const float* restrict a = a_;
+ const float* restrict x = x_;
+ const float* restrict beta = beta_;
+ float* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.f) {
+ // scale y by beta
+ if (*beta == 0.f)
+ bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const float* restrict x_tmp = x;
+ const float* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLE "v20, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__(VSE "v20, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v20");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // compute dot product of x with remaining < 6 rows of a
+ const float* restrict x_tmp = x;
+ // a_col will move along the last row of a!
+ const float* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const float* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ __asm__("vmv.s.x v31, x0");
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+
+void bli_ddotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ // Think of a as a b x m row-major matrix (i.e. rsa = lda, csa = inca).
+ // We process 6 elements of y per iteration. a points to the 6 x m block
+ // of a needed for this iteration; each 6 x m block is broken into
+ // 6 x vl blocks. a_col points to the current 6 x vl block, x_tmp is used
+ // to load from x, and a_row is used to load each of the 6 rows of this
+ // 6 x vl block.
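+ // (Same operation and blocking as bli_sdotxf_sifive_x280_asm above; see
+ // the reference sketch there. Only the element width differs.)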
+ (void)conjat;
+ (void)conjx;
+ (void)cntx;
+ const double* restrict alpha = alpha_;
+ const double* restrict a = a_;
+ const double* restrict x = x_;
+ const double* restrict beta = beta_;
+ double* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || *alpha == 0.) {
+ // scale y by beta
+ if (*beta == 0.)
+ bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ inca *= FLT_SIZE;
+ lda *= FLT_SIZE;
+ incx *= FLT_SIZE;
+ incy *= FLT_SIZE;
+ inc_t a_bump = 6 * lda; // to bump a down 6 rows
+
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const double* restrict x_tmp = x;
+ const double* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLE "v20, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v0, v0, v28");
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v20, v20, v28");
+ first = false;
+ }
+ else {
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v0, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v20, v24, v28");
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
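+ // reduce each accumulator to a scalar with vfredusum.vs, then compute
+ // y[i] = alpha * dot when beta == 0, or y[i] = beta * y[i] + alpha * dot.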
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__(VSE "v20, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v20");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // compute dot product of x with remaining < 6 rows of a
+ const double* restrict x_tmp = x;
+ // a_col will move along the last row of a!
+ const double* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
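+ // same strip-mining as above, but only b (< 6) rows remain: a_col starts at
+ // the last remaining row and a_row steps backwards by lda, and the switch
+ // cases below fall through intentionally so that exactly b rows are processed.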
+ while (avl) {
+ const double* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == FLT_SIZE)
+ __asm__(VLE "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == FLT_SIZE) {
+ // a unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_row));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLE "v24, (%0)" : : "r"(a_row));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a unit stride
+ else {
+ // a non-unit stride
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v16, v16, v28");
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v12, v12, v28");
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v8, v8, v28");
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmul.vv v4, v4, v28");
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmul.vv v0, v0, v28");
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v16, v24, v28");
+ case 4:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v12, v24, v28");
+ case 3:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v8, v24, v28");
+ case 2:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ __asm__("vfmacc.vv v4, v24, v28");
+ case 1:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("vfmacc.vv v0, v24, v28");
+ }
+ }
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
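+ // y now points to y[b - 1]; results are stored from the last remaining element
+ // backwards, and these switch cases likewise fall through intentionally.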
+ __asm__("vmv.s.x v31, x0");
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__(VSE "v16, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v16");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__(VSE "v12, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v12");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__(VSE "v8, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v8");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__(VSE "v4, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v4");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__(VSE "v0, (%0)" : : "r"(y));
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y));
+ __asm__(FMUL "ft0, ft11, ft0");
+ __asm__("vfmv.s.f v30, ft0");
+ __asm__("vfmacc.vf v30, ft10, v0");
+ __asm__(VSE "v30, (%0)" : : "r"(y));
+ }
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define FMUL "fmul.s "
+#define FMADD "fmadd.s "
+#define FNMSUB "fnmsub.s "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSE "vse32.v "
+
+void bli_cdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ (void)cntx;
+ const scomplex* restrict alpha = alpha_;
+ const scomplex* restrict a = a_;
+ const scomplex* restrict x = x_;
+ const scomplex* restrict beta = beta_;
+ scomplex* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) {
+ // scale y by beta
+ if (beta->real == 0.f && beta->imag == 0.f)
+ bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to the case where A^T is not conjugated, then conjugate the
+ // computed product A^T * x at the end if needed.
+ conj_t conjatx = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjatx);
+ }
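+ // (conj(A)^T * x == conj(A^T * conj(x)), so toggling conjx and conjugating the
+ // final accumulators, as tracked by conjatx, gives the same result.)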
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 6 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
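+ // y_bump is incy - FLT_SIZE because y is advanced by FLT_SIZE between storing
+ // the real and imaginary parts of each element, for a net advance of incy.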
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
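+ // in the loop below, v28/v30 hold the real/imaginary parts of a chunk of x,
+ // v24/v26 those of one row of a, and the register pairs v0/v2 ... v20/v22
+ // accumulate the six complex dot products via the vcmul_vv/vcmacc_vv macros.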
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
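+ // reduce the real and imaginary accumulators for each of the 6 rows, combine
+ // with alpha (and with beta * y[i] when beta is nonzero), and store to y.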
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vfredusum.vs v22, v22, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 6
+ const scomplex* restrict x_tmp = x;
+ const scomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
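+ // cleanup strip-mining: a_col starts at the last remaining row and the switch
+ // cases below fall through so that exactly b rows are processed.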
+ while (avl) {
+ const scomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
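+ // here y walks backwards: after the real part is stored, y has advanced by
+ // FLT_SIZE, so subtracting incy + FLT_SIZE steps back one full element.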
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0.f && beta->imag == 0.f) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ } // end cleanup
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef FMUL
+#undef FMADD
+#undef FNMSUB
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSE
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define FMUL "fmul.d "
+#define FMADD "fmadd.d "
+#define FNMSUB "fnmsub.d "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSE "vse64.v "
+
+void bli_zdotxf_sifive_x280_asm(
+ conj_t conjat,
+ conj_t conjx,
+ dim_t m,
+ dim_t b,
+ const void* restrict alpha_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ const void* restrict x_, inc_t incx,
+ const void* restrict beta_,
+ void* restrict y_, inc_t incy,
+ const cntx_t* restrict cntx
+ ) {
+ (void)cntx;
+ const dcomplex* restrict alpha = alpha_;
+ const dcomplex* restrict a = a_;
+ const dcomplex* restrict x = x_;
+ const dcomplex* restrict beta = beta_;
+ dcomplex* restrict y = y_;
+
+ if (b == 0)
+ return;
+ else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) {
+ // scale y by beta
+ if (beta->real == 0. && beta->imag == 0.)
+ bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ else
+ bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL);
+ return;
+ }
+
+ __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE));
+ // Reduce to the case where A^T is not conjugated, then conjugate the
+ // computed product A^T * x at the end if needed.
+ conj_t conjatx = BLIS_NO_CONJUGATE;
+ if (conjat == BLIS_CONJUGATE) {
+ bli_toggle_conj(&conjat);
+ bli_toggle_conj(&conjx);
+ bli_toggle_conj(&conjatx);
+ }
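+ // (same reduction to the non-conjugated A^T case as in bli_cdotxf_sifive_x280_asm above.)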
+ inca *= 2 * FLT_SIZE;
+ lda *= 2 * FLT_SIZE;
+ incx *= 2 * FLT_SIZE;
+ incy *= 2 * FLT_SIZE;
+ // these are used to bump a and y, resp.
+ inc_t a_bump = 6 * lda;
+ inc_t y_bump = incy - FLT_SIZE;
+ while (b >= 6) {
+ // compute dot product of x with 6 rows of a
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_col = a;
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v20, v22, v24, v26, v28, v30);
+ first = false;
+ }
+ else {
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v20, v22, v24, v26, v28, v30);
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("vmv.s.x v31, x0");
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v20, v20, v31");
+ __asm__("vfredusum.vs v22, v22, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v20, v22, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v20, v22);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump));
+
+ // a += 6 * lda;
+ __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump));
+ b -= 6;
+ }
+
+ if (b > 0) {
+ // cleanup loop, 0 < b < 6
+ const dcomplex* restrict x_tmp = x;
+ const dcomplex* restrict a_col;
+ __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda));
+ size_t avl = m;
+ bool first = true;
+ while (avl) {
+ const dcomplex* restrict a_row = a_col;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ if (incx == 2 * FLT_SIZE)
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp));
+ else
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx));
+ if (inca == 2 * FLT_SIZE) {
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a unit stride
+ else { // a non-unit stride
+ if (conjx == BLIS_NO_CONJUGATE) {
+ // a non-unit stride, conjx = no conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_NO_CONJUGATE
+ else { // conjx == BLIS_CONJUGATE
+ // a non-unit stride, conjx = conj
+ if (first) {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmul_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmul_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ first = false;
+ }
+ else {
+ switch (b) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v16, v18, v24, v26, v28, v30);
+ case 4:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v12, v14, v24, v26, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v8, v10, v24, v26, v28, v30);
+ case 2:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda));
+ vcmacc_vv_conj(v4, v6, v24, v26, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca));
+ vcmacc_vv_conj(v0, v2, v24, v26, v28, v30);
+ }
+ }
+ } // end conjx == BLIS_CONJUGATE
+ } // end a non-unit stride
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx));
+ __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca));
+ avl -= vl;
+ }
+
+ __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy));
+ y_bump = incy + FLT_SIZE;
+ __asm__("vmv.s.x v31, x0");
+
+ switch (b) {
+ case 5:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v16, v16, v31");
+ __asm__("vfredusum.vs v18, v18, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v16, v18, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v16, v18);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 4:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v12, v12, v31");
+ __asm__("vfredusum.vs v14, v14, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v12, v14, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v12, v14);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 3:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v8, v8, v31");
+ __asm__("vfredusum.vs v10, v10, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v8, v10, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v8, v10);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 2:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v4, v4, v31");
+ __asm__("vfredusum.vs v6, v6, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v4, v6, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v4, v6);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump));
+ case 1:
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE));
+ __asm__("vfredusum.vs v0, v0, v31");
+ __asm__("vfredusum.vs v2, v2, v31");
+ __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ if (beta->real == 0. && beta->imag == 0.) {
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmul_vf(v28, v29, v0, v2, ft8, ft9);
+ }
+ else {
+ vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9);
+ }
+ }
+ else {
+ __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE));
+ cmul(ft0, ft1, ft10, ft11, ft2, ft3);
+ __asm__("vfmv.s.f v28, ft0");
+ __asm__("vfmv.s.f v29, ft1");
+ if (conjatx == BLIS_NO_CONJUGATE) {
+ vcmacc_vf(v28, v29, ft8, ft9, v0, v2);
+ }
+ else {
+ vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2);
+ }
+ }
+ __asm__(VSE "v28, (%0)" : : "r"(y));
+ __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE));
+ __asm__(VSE "v29, (%0)" : : "r"(y));
+ }
+ } // end cleanup
+ return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
new file mode 100644
index 0000000000..35ca23677d
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c
@@ -0,0 +1,678 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSSEG7 "vssseg7e32.v "
+
+void bli_spackm_sifive_x280_asm_7xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const float* kappa = kappa_;
+ const float* a = a_;
+ float* p = p_;
+
+ float kappa_cast = *kappa;
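+ // Pack a 7 x n micropanel of A into P. If lda == 1, each of the 7 rows of A is
+ // contiguous: the rows are loaded into v0-v6 and interleaved into P with a
+ // 7-field strided segment store. Otherwise the panel is packed one column at a
+ // time. Rows at index cdim and above, and columns n..n_max-1 of P, are zero-filled.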
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 0: __asm__("vmv.v.i v0, 0");
+ case 1: __asm__("vmv.v.i v1, 0");
+ case 2: __asm__("vmv.v.i v2, 0");
+ case 3: __asm__("vmv.v.i v3, 0");
+ case 4: __asm__("vmv.v.i v4, 0");
+ case 5: __asm__("vmv.v.i v5, 0");
+ case 6: __asm__("vmv.v.i v6, 0");
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const float* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.f) {
+ switch (cdim) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 8
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSSEG7 "vssseg7e64.v "
+
+void bli_dpackm_sifive_x280_asm_7xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const double* kappa = kappa_;
+ const double* a = a_;
+ double* p = p_;
+
+ double kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 0: __asm__("vmv.v.i v0, 0");
+ case 1: __asm__("vmv.v.i v1, 0");
+ case 2: __asm__("vmv.v.i v2, 0");
+ case 3: __asm__("vmv.v.i v3, 0");
+ case 4: __asm__("vmv.v.i v4, 0");
+ case 5: __asm__("vmv.v.i v5, 0");
+ case 6: __asm__("vmv.v.i v6, 0");
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const double* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.) {
+ switch (cdim) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp));
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSSEG7
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+
+void bli_cpackm_sifive_x280_asm_6xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const scomplex* kappa = kappa_;
+ const scomplex* a = a_;
+ scomplex* p = p_;
+
+ scomplex kappa_cast = *kappa;
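+ // Complex variant of the packing above: vlseg2/vlsseg2 split each row into real
+ // and imaginary vectors. For kappa == 1, conjugation is just vfneg on the
+ // imaginary parts; otherwise rows are scaled by kappa with vcmul_vf2 /
+ // vcmul_vf_conj2 before the 6-field segment stores into P.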
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ case 1:
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v5, 0");
+ case 3:
+ __asm__("vmv.v.i v6, 0");
+ __asm__("vmv.v.i v7, 0");
+ case 4:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v9, 0");
+ case 5:
+ __asm__("vmv.v.i v10, 0");
+ __asm__("vmv.v.i v11, 0");
+ }
+ }
+ else {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v13, 0");
+ case 1:
+ __asm__("vmv.v.i v14, 0");
+ __asm__("vmv.v.i v15, 0");
+ case 2:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v17, 0");
+ case 3:
+ __asm__("vmv.v.i v18, 0");
+ __asm__("vmv.v.i v19, 0");
+ case 4:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v21, 0");
+ case 5:
+ __asm__("vmv.v.i v22, 0");
+ __asm__("vmv.v.i v23, 0");
+ }
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const scomplex* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 6:
+ __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim) {
+ case 6: __asm__("vfneg.v v11, v11");
+ case 5: __asm__("vfneg.v v9, v9");
+ case 4: __asm__("vfneg.v v7, v7");
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim) {
+ case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim) {
+ case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG6
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+
+void bli_zpackm_sifive_x280_asm_6xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const dcomplex* kappa = kappa_;
+ const dcomplex* a = a_;
+ dcomplex* p = p_;
+
+ dcomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ case 1:
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v5, 0");
+ case 3:
+ __asm__("vmv.v.i v6, 0");
+ __asm__("vmv.v.i v7, 0");
+ case 4:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v9, 0");
+ case 5:
+ __asm__("vmv.v.i v10, 0");
+ __asm__("vmv.v.i v11, 0");
+ }
+ }
+ else {
+ switch (cdim) {
+ case 0:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v13, 0");
+ case 1:
+ __asm__("vmv.v.i v14, 0");
+ __asm__("vmv.v.i v15, 0");
+ case 2:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v17, 0");
+ case 3:
+ __asm__("vmv.v.i v18, 0");
+ __asm__("vmv.v.i v19, 0");
+ case 4:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v21, 0");
+ case 5:
+ __asm__("vmv.v.i v22, 0");
+ __asm__("vmv.v.i v23, 0");
+ }
+ }
+ a += (cdim - 1) * inca;
+ size_t avl = n;
+ while (avl) {
+ const dcomplex* a_tmp = a;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ switch (cdim) {
+ case 6:
+ __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim) {
+ case 6: __asm__("vfneg.v v11, v11");
+ case 5: __asm__("vfneg.v v9, v9");
+ case 4: __asm__("vfneg.v v7, v7");
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim) {
+ case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim) {
+ case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag);
+ case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag);
+ case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag);
+ case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp));
+ __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp));
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v3, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v2, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v1, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
new file mode 100644
index 0000000000..89e05ecae3
--- /dev/null
+++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c
@@ -0,0 +1,838 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define VSSSEG7 "vssseg7e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG5 "vssseg5e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG3 "vssseg3e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define NR 64
+
+void bli_spackm_sifive_x280_asm_64xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const float* kappa = kappa_;
+ const float* a = a_;
+ float* p = p_;
+
+ float kappa_cast = *kappa;
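+ // Pack an NR x n panel (NR = 64) of A into P. With lda == 1, rows are packed
+ // eight at a time via 8-field segment stores, and a switch handles the 1-7 row
+ // remainder. Rows cdim..NR-1 of each packed column and columns n..n_max-1 of P
+ // are zero-filled.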
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v8, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const float* a_tmp = a;
+ float* p_tmp = p;
+ while (cdim_tmp >= 8) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ p_tmp += 8;
+ cdim_tmp -= 8;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.f) {
+ switch (cdim_tmp) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 6:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 5:
+ __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 4:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 3:
+ __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.f) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define VSSSEG8 "vssseg8e64.v "
+#define VSSSEG7 "vssseg7e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG5 "vssseg5e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG3 "vssseg3e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define NR 32
+
+void bli_dpackm_sifive_x280_asm_32xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) conja;
+ (void) cntx;
+ const double* kappa = kappa_;
+ const double* a = a_;
+ double* p = p_;
+
+ double kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v8, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const double* a_tmp = a;
+ double* p_tmp = p;
+ while (cdim_tmp >= 8) {
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLE "v7, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast));
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ p_tmp += 8;
+ cdim_tmp -= 8;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VLE "v6, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 6:
+ __asm__(VLE "v5, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 5:
+ __asm__(VLE "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 4:
+ __asm__(VLE "v3, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 3:
+ __asm__(VLE "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLE "v1, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast != 1.) {
+ switch (cdim_tmp) {
+ case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast));
+ case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast));
+ case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast));
+ case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast));
+ case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast));
+ case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast));
+ case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ }
+ switch (cdim_tmp) {
+ case 7:
+ __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 6:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 5:
+ __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 4:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 3:
+ __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp));
+ break;
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v8, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == FLT_SIZE) {
+ __asm__(VLE "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast != 1.) {
+ __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast));
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSE "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef VSSSEG8
+#undef VSSSEG7
+#undef VSSSEG6
+#undef VSSSEG5
+#undef VSSSEG4
+#undef VSSSEG3
+#undef VSSSEG2
+#undef NR
+
+#define FLT_SIZE 4
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define VSSSEG4 "vssseg4e32.v "
+#define VSSSEG6 "vssseg6e32.v "
+#define VSSSEG8 "vssseg8e32.v "
+#define NR 32
+
+void bli_cpackm_sifive_x280_asm_32xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const scomplex* kappa = kappa_;
+ const scomplex* a = a_;
+ scomplex* p = p_;
+
+ scomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const scomplex* a_tmp = a;
+ scomplex* p_tmp = p;
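+ // Pack four complex rows per iteration: each vlseg2 load splits one row into
+ // real/imaginary vectors (v0-v7 for the four rows), and a single 8-field
+ // segment store writes them, interleaved, into P.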
+ while (cdim_tmp >= 4) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ __asm__("vfneg.v v3, v3");
+ __asm__("vfneg.v v5, v5");
+ __asm__("vfneg.v v7, v7");
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ p_tmp += 4;
+ cdim_tmp -= 4;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v2, v2");
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
+
+#undef FLT_SIZE
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef VSSSEG4
+#undef VSSSEG6
+#undef VSSSEG8
+#undef NR
+
+#define FLT_SIZE 8
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define VSSSEG4 "vssseg4e64.v "
+#define VSSSEG6 "vssseg6e64.v "
+#define VSSSEG8 "vssseg8e64.v "
+#define NR 16
+
+void bli_zpackm_sifive_x280_asm_16xk
+ (
+ conj_t conja,
+ pack_t schema,
+ dim_t cdim,
+ dim_t n,
+ dim_t n_max,
+ const void* restrict kappa_,
+ const void* restrict a_, inc_t inca, inc_t lda,
+ void* restrict p_, inc_t ldp,
+ const cntx_t* cntx
+ )
+{
+ (void) cntx;
+ const dcomplex* kappa = kappa_;
+ const dcomplex* a = a_;
+ dcomplex* p = p_;
+
+ dcomplex kappa_cast = *kappa;
+ if (lda == 1) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ size_t avl = n;
+ while (avl) {
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE));
+ dim_t cdim_tmp = cdim;
+ const dcomplex* a_tmp = a;
+ dcomplex* p_tmp = p;
+ while (cdim_tmp >= 4) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp));
+ a_tmp += inca;
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v1, v1");
+ __asm__("vfneg.v v3, v3");
+ __asm__("vfneg.v v5, v5");
+ __asm__("vfneg.v v7, v7");
+ }
+ __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ }
+ p_tmp += 4;
+ cdim_tmp -= 4;
+ }
+ if (cdim_tmp > 0) {
+ a_tmp += (cdim_tmp - 1) * inca;
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 2:
+ __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp));
+ a_tmp -= inca;
+ case 1:
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: __asm__("vfneg.v v5, v5");
+ case 2: __asm__("vfneg.v v3, v3");
+ case 1: __asm__("vfneg.v v1, v1");
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ else {
+ switch (cdim_tmp) {
+ case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag);
+ case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag);
+ case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag);
+ }
+ }
+ switch (cdim_tmp) {
+ case 3:
+ __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 2:
+ __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ case 1:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp));
+ break;
+ }
+ }
+ p_tmp += cdim_tmp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE));
+ for (size_t i = 0; i < vl; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp));
+ p_tmp += ldp;
+ }
+ a += vl;
+ p += vl * ldp;
+ avl -= vl;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ else {
+ inca *= 2 * FLT_SIZE;
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ for (size_t i = 0; i < n; ++i) {
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE));
+ if (inca == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(a));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca));
+ }
+ if (kappa_cast.real == 1. && kappa_cast.imag == 0.) {
+ if (conja == BLIS_CONJUGATE) {
+ __asm__("vfneg.v v2, v2");
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ }
+ else {
+ if (conja == BLIS_NO_CONJUGATE) {
+ vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ else {
+ vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag);
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(p));
+ }
+ a += lda;
+ p += ldp;
+ }
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE));
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = n; i < n_max; ++i) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(p));
+ p += ldp;
+ }
+ }
+ return;
+}
diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
new file mode 100644
index 0000000000..b9715988d6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c
@@ -0,0 +1,2405 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../riscv_cmul_macros_asm.h"
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+// byte-size of the floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLE "vle32.v "
+#define VLSE "vlse32.v "
+#define VSE "vse32.v "
+#define VSSE "vsse32.v "
+#define PACKMR 8
+#define PACKNR 64
+
+void bli_sgemm_7m4
+ (
+ dim_t N,
+ dim_t K,
+ const float* restrict alpha,
+ const float* restrict a,
+ const float* restrict b,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0
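+ // Rows of the 7 x N microtile accumulate in the LMUL-4 register groups
+ // v0, v4, v8, v12, v16, v20 and v24; v28 holds the current row of b.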
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmul.vf v24, v28, ft6");
+
+ first = false;
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmacc.vf v24, ft6, v28");
+ }
+
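+ // Advance a to the next column of its packed micropanel (stride PACKMR)
+ // and b to the next row of its packed micropanel (stride PACKNR).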
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.f) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("vfmul.vf v24, v24, ft10");
+ }
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ float *c_tmp = c;
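+ // Each row interleaves a load of c with the alpha scaling of the
+ // accumulator, then adds beta times the loaded row.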
+ if (csc == FLT_SIZE) { // c unit column stride
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c unit column stride
+ else { // c non-unit column stride
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c non-unit column stride
+ } // end beta != 0.f
+
+ // store c
+ if (csc == FLT_SIZE) {
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_sgemm_7m4_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const float* restrict alpha,
+ const float* restrict a,
+ const float* restrict b,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K sgemm, 0 < M < 7, 0 < N <= 64 = vlmax, K > 0
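+ // The switch statements below intentionally fall through so that all M
+ // rows (counted from row M-1 down to row 0) are processed.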
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+ }
+ first = false;
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+ }
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.f) {
+ switch (M) {
+ case 6:
+ __asm__("vfmul.vf v20, v20, ft10");
+ case 5:
+ __asm__("vfmul.vf v16, v16, ft10");
+ case 4:
+ __asm__("vfmul.vf v12, v12, ft10");
+ case 3:
+ __asm__("vfmul.vf v8, v8, ft10");
+ case 2:
+ __asm__("vfmul.vf v4, v4, ft10");
+ case 1:
+ __asm__("vfmul.vf v0, v0, ft10");
+ }
+ }
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ float *c_tmp = c;
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 6:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.f
+
+ // store c
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+ return;
+}
+
+void bli_sgemm_7m4_k0
+ (
+ dim_t M,
+ dim_t N,
+ const float* restrict beta,
+ float* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 7, 0 < N <= 64 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
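+ // Only the beta*c term is formed here: c is either cleared or scaled by
+ // beta, addressed from its last row upward with fall-through switches.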
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+ if (*beta == 0.f) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta == 0.f
+ else { // beta != 0.f
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta));
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLE "v24, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLE "v20, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.f
+ return;
+}
+
+void bli_sgemm_sifive_x280_asm_7m4
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ (void) data;
+ (void) cntx;
+ const float* restrict alpha = alpha_;
+ const float* restrict beta = beta_;
+ const float* restrict a = a_;
+ const float* restrict b = b_;
+ float* restrict c = c_;
+
+ // M x N x K sgemm
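+ // Dispatch: early exit on degenerate sizes, a beta-only update when
+ // K == 0, the full 7-row microkernel when M == 7, and the cleanup
+ // kernel for partial M.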
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc);
+ else if (M == 7)
+ bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of the floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLE "vle64.v "
+#define VLSE "vlse64.v "
+#define VSE "vse64.v "
+#define VSSE "vsse64.v "
+#define PACKMR 8
+#define PACKNR 32
+
+void bli_dgemm_7m4
+ (
+ dim_t N,
+ dim_t K,
+ const double* restrict alpha,
+ const double* restrict a,
+ const double* restrict b,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 7 x N x K dgemm, 0 < N <= 32 = vlmax, K > 0
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmul.vf v24, v28, ft6");
+
+ first = false;
+ }
+ else {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__("vfmacc.vf v24, ft6, v28");
+ }
+
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.) {
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("vfmul.vf v24, v24, ft10");
+ }
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ double *c_tmp = c;
+ if (csc == FLT_SIZE) { // c unit column stride
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c unit column stride
+ else { // c non-unit column stride
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v0, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft10");
+ __asm__("vfmacc.vf v24, ft11, v28");
+ } // end c non-unit column stride
+ } // end beta != 0.
+
+ // store c
+ if (csc == FLT_SIZE) {
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_dgemm_7m4_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const double* restrict alpha,
+ const double* restrict a,
+ const double* restrict b,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K dgemm, 0 < M < 7, 0 < N <= 32 = vlmax, K > 0
+ __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ bool first = true;
+ // compute a*b
+ for (dim_t k = 0; k < K; ++k) {
+ __asm__(VLE "v28, (%0)" : : "r"(b));
+ if (first) {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmul.vf v20, v28, ft5");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmul.vf v16, v28, ft4");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmul.vf v12, v28, ft3");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmul.vf v8, v28, ft2");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmul.vf v4, v28, ft1");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmul.vf v0, v28, ft0");
+ }
+ first = false;
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ __asm__("vfmacc.vf v20, ft5, v28");
+ case 5:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__("vfmacc.vf v16, ft4, v28");
+ case 4:
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ __asm__("vfmacc.vf v12, ft3, v28");
+ case 3:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__("vfmacc.vf v8, ft2, v28");
+ case 2:
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ __asm__("vfmacc.vf v4, ft1, v28");
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__("vfmacc.vf v0, ft0, v28");
+ }
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
+
+ // compute alpha*a*b + beta*c
+ if (*beta == 0.) {
+ switch (M) {
+ case 6:
+ __asm__("vfmul.vf v20, v20, ft10");
+ case 5:
+ __asm__("vfmul.vf v16, v16, ft10");
+ case 4:
+ __asm__("vfmul.vf v12, v12, ft10");
+ case 3:
+ __asm__("vfmul.vf v8, v8, ft10");
+ case 2:
+ __asm__("vfmul.vf v4, v4, ft10");
+ case 1:
+ __asm__("vfmul.vf v0, v0, ft10");
+ }
+ }
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
+ double *c_tmp = c;
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 6:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v20, ft11, v28");
+ case 5:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v16, ft11, v28");
+ case 4:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v12, ft11, v28");
+ case 3:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v8, ft11, v28");
+ case 2:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft10");
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__("vfmacc.vf v4, ft11, v28");
+ case 1:
+ __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft10");
+ __asm__("vfmacc.vf v0, ft11, v28");
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.
+
+ // store c
+ if (csc == FLT_SIZE) {
+ switch (M) {
+ case 6:
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 6:
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+ return;
+}
+
+void bli_dgemm_7m4_k0
+ (
+ dim_t M,
+ dim_t N,
+ const double* restrict beta,
+ double* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 7, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+ __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ c += (M - 1) * rsc;
+ rsc *= FLT_SIZE;
+ csc *= FLT_SIZE;
+ if (*beta == 0.) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta == 0.
+ else { // beta != 0.
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta));
+ if (csc == FLT_SIZE) { // c unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLE "v24, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSE "v24, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLE "v20, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSE "v20, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLE "v16, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSE "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLE "v12, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSE "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLE "v8, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSE "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLE "v4, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSE "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLE "v0, (%0)" : : "r"(c));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSE "v0, (%0)" : : "r"(c));
+
+ }
+ } // end c unit column stride
+ else { // c non-unit column stride
+ switch (M) {
+ case 7:
+ __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v24, v24, ft0");
+ __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 6:
+ __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v20, v20, ft0");
+ __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 5:
+ __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v16, v16, ft0");
+ __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v12, v12, ft0");
+ __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v8, v8, ft0");
+ __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v4, v4, ft0");
+ __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("vfmul.vf v0, v0, ft0");
+ __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ } // end c non-unit column stride
+ } // end beta != 0.
+ return;
+}
+
+void bli_dgemm_sifive_x280_asm_7m4
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ (void) data;
+ (void) cntx;
+ const double* restrict alpha = alpha_;
+ const double* restrict beta = beta_;
+ const double* restrict a = a_;
+ const double* restrict b = b_;
+ double* restrict c = c_;
+
+ // M x N x K dgemm
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc);
+ else if (M == 7)
+ bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLE
+#undef VLSE
+#undef VSE
+#undef VSSE
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of underlying floating point type
+#define FLT_SIZE 4
+#define FLT_LOAD "flw "
+#define VLSEG2 "vlseg2e32.v "
+#define VLSSEG2 "vlsseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define PACKMR 8
+#define PACKNR 32
+
+void bli_cgemm_6m2
+ (
+ dim_t N,
+ dim_t K,
+ const scomplex* restrict alpha,
+ const scomplex* restrict a,
+ const scomplex* restrict b,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 6 x N x K cgemm, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
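+ // Rows of b are double-buffered in v24/v26 and v28/v30, and the K loop
+ // below is unrolled by two so that each half consumes one buffer while
+ // the other is being reloaded.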
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmul_vf(v20, v22, v24, v26, ft10, ft11);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
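+ // Scale each accumulator by alpha using v24-v31 as scratch:
+ // real' = alpha.real * real - alpha.imag * imag
+ // imag' = alpha.real * imag + alpha.imag * real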
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmul.vf v28, v22, ft1");
+ __asm__("vfmul.vf v30, v20, ft1");
+
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ __asm__("vfmsub.vf v20, ft0, v28");
+ __asm__("vfmadd.vf v22, ft0, v30");
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
+ if (csc == 2 * FLT_SIZE) {
+ scomplex *c_tmp = c;
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ else {
+ scomplex *c_tmp = c;
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_cgemm_6m2_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const scomplex* restrict alpha,
+ const scomplex* restrict a,
+ const scomplex* restrict b,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
+
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ case 4:
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+ case 3:
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ case 2:
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+ case 1:
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ }
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real != 0.f || beta_cast.imag != 0.f) {
+ if (csc == 2 * FLT_SIZE) {
+ scomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ else {
+ scomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ switch (M) {
+ case 5:
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 5:
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+
+ return;
+}
+
+void bli_cgemm_6m2_k0
+ (
+ dim_t M,
+ dim_t N,
+ const scomplex* restrict beta,
+ scomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
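+    // (With K == 0 the alpha*a*b contribution is an empty sum, so alpha is never read
+    // here; the reference kernel still forms alpha*0, which would propagate an
+    // infinite or NaN alpha into c.)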
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ csc *= 2 * FLT_SIZE;
+
+ scomplex beta_cast = *beta;
+ if (beta_cast.real == 0.f && beta_cast.imag == 0.f) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ c += rsc;
+ }
+ }
+ else {
+ // scale c by beta
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ c += rsc;
+ }
+ }
+ return;
+}
+
+void bli_cgemm_sifive_x280_asm_6m2
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ // M x N x K cgemm
+ (void) data;
+ (void) cntx;
+ const scomplex* restrict alpha = alpha_;
+ const scomplex* restrict beta = beta_;
+ const scomplex* restrict a = a_;
+ const scomplex* restrict b = b_;
+ scomplex* restrict c = c_;
+
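+    // Dispatch on the microtile shape: K == 0 reduces to a beta-update of c, a full
+    // 6-row tile uses the main kernel, and partial tiles use the cleanup kernel.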
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc);
+ else if (M == 6)
+ bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
+
+// byte-size of underlying floating point type
+#define FLT_SIZE 8
+#define FLT_LOAD "fld "
+#define VLSEG2 "vlseg2e64.v "
+#define VLSSEG2 "vlsseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define PACKMR 8
+#define PACKNR 16
+
+void bli_zgemm_6m2
+ (
+ dim_t N,
+ dim_t K,
+ const dcomplex* restrict alpha,
+ const dcomplex* restrict a,
+ const dcomplex* restrict b,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 6 x N x K zgemm, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
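+    // Rows of b are double-buffered in v24/v26 and v28/v30 so the next row can be
+    // loaded while the current one is consumed; the six rows of c accumulate in
+    // v0/v2 through v20/v22.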
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmul_vf(v20, v22, v24, v26, ft10, ft11);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
+ vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
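+    // Scale the accumulated a*b by alpha: for each row, re' = re*alpha.r - im*alpha.i
+    // and im' = im*alpha.r + re*alpha.i (vfmul forms the alpha.i products, then
+    // vfmsub/vfmadd fold in the alpha.r terms).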
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmul.vf v28, v22, ft1");
+ __asm__("vfmul.vf v30, v20, ft1");
+
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ __asm__("vfmsub.vf v20, ft0, v28");
+ __asm__("vfmadd.vf v22, ft0, v30");
+
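+    // If beta != 0, load c and accumulate beta*c into the alpha-scaled products;
+    // unit column stride (csc == 2*FLT_SIZE) uses contiguous segment loads,
+    // otherwise strided segment loads.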
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real != 0. || beta_cast.imag != 0.) {
+ if (csc == 2 * FLT_SIZE) {
+ dcomplex *c_tmp = c;
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ else {
+ dcomplex *c_tmp = c;
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+
+ vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30);
+ }
+ }
+
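+    // Write the six updated rows back to c, again choosing contiguous or strided
+    // segment stores based on csc.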
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSEG2 "v20, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
+ __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc));
+ }
+
+ return;
+}
+
+void bli_zgemm_6m2_cleanup
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const dcomplex* restrict alpha,
+ const dcomplex* restrict a,
+ const dcomplex* restrict b,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0
+ // pairs of register groups hold the real and imag. parts of rows of c and b
+
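+    // The switch statements below fall through intentionally so that only the
+    // first M rows of a and c are touched.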
+ __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmul_vf(v16, v18, v24, v26, ft8, ft9);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmul_vf(v12, v14, v24, v26, ft6, ft7);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmul_vf(v8, v10, v24, v26, ft4, ft5);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmul_vf(v4, v6, v24, v26, ft2, ft3);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmul_vf(v0, v2, v24, v26, ft0, ft1);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ while (K > 0) {
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ K -= 1;
+
+ if (K == 0) { break; }
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
+ vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
+ vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
+ vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
+ vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
+ vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ K -= 1;
+
+ if (K >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
+ __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ c += (M - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
+
+ switch (M) {
+ case 5:
+ __asm__("vfmul.vf v24, v18, ft1");
+ __asm__("vfmul.vf v26, v16, ft1");
+ __asm__("vfmsub.vf v16, ft0, v24");
+ __asm__("vfmadd.vf v18, ft0, v26");
+ case 4:
+ __asm__("vfmul.vf v28, v14, ft1");
+ __asm__("vfmul.vf v30, v12, ft1");
+ __asm__("vfmsub.vf v12, ft0, v28");
+ __asm__("vfmadd.vf v14, ft0, v30");
+ case 3:
+ __asm__("vfmul.vf v24, v10, ft1");
+ __asm__("vfmul.vf v26, v8, ft1");
+ __asm__("vfmsub.vf v8, ft0, v24");
+ __asm__("vfmadd.vf v10, ft0, v26");
+ case 2:
+ __asm__("vfmul.vf v28, v6, ft1");
+ __asm__("vfmul.vf v30, v4, ft1");
+ __asm__("vfmsub.vf v4, ft0, v28");
+ __asm__("vfmadd.vf v6, ft0, v30");
+ case 1:
+ __asm__("vfmul.vf v24, v2, ft1");
+ __asm__("vfmul.vf v26, v0, ft1");
+ __asm__("vfmsub.vf v0, ft0, v24");
+ __asm__("vfmadd.vf v2, ft0, v26");
+ }
+
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real != 0. || beta_cast.imag != 0.) {
+ if (csc == 2 * FLT_SIZE) {
+ dcomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ else {
+ dcomplex *c_tmp = c;
+ switch (M) {
+ case 5:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
+ case 4:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
+ case 3:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
+ case 2:
+ __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
+ vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
+ case 1:
+ __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
+ vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
+ }
+ }
+ }
+
+ if (csc == 2 * FLT_SIZE) {
+ switch (M) {
+ case 5:
+ __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ }
+ }
+ else {
+ switch (M) {
+ case 5:
+ __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 4:
+ __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 3:
+ __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 2:
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
+ case 1:
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ }
+
+ return;
+}
+
+void bli_zgemm_6m2_k0
+ (
+ dim_t M,
+ dim_t N,
+ const dcomplex* restrict beta,
+ dcomplex* restrict c, inc_t rsc, inc_t csc
+ )
+{
+ // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
+ // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
+ csc *= 2 * FLT_SIZE;
+
+ dcomplex beta_cast = *beta;
+ if (beta_cast.real == 0. && beta_cast.imag == 0.) {
+ // set c to 0
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE)
+ __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
+ else
+ __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ c += rsc;
+ }
+ }
+ else {
+ // scale c by beta
+ for (size_t i = 0; i < M; ++i) {
+ if (csc == 2 * FLT_SIZE) {
+ __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
+ }
+ else {
+ __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
+ vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
+ __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
+ }
+ c += rsc;
+ }
+ }
+ return;
+}
+
+void bli_zgemm_sifive_x280_asm_6m2
+ (
+ dim_t M,
+ dim_t N,
+ dim_t K,
+ const void* restrict alpha_,
+ const void* restrict a_,
+ const void* restrict b_,
+ const void* restrict beta_,
+ void* restrict c_, inc_t rsc, inc_t csc,
+ auxinfo_t* restrict data,
+ const cntx_t* restrict cntx
+ )
+{
+ // M x N x K zgemm
+ (void) data;
+ (void) cntx;
+ const dcomplex* restrict alpha = alpha_;
+ const dcomplex* restrict beta = beta_;
+ const dcomplex* restrict a = a_;
+ const dcomplex* restrict b = b_;
+ dcomplex* restrict c = c_;
+
+ if (M <= 0 || N <= 0 || K < 0)
+ return;
+ else if (K == 0)
+ bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc);
+ else if (M == 6)
+ bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
+ else
+ bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
+ return;
+}
+
+#undef FLT_SIZE
+#undef FLT_LOAD
+#undef VLSEG2
+#undef VLSSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef PACKMR
+#undef PACKNR
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
new file mode 100644
index 0000000000..18df010d05
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c
@@ -0,0 +1,327 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{
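+    // Computes b11 := inv(a11) * (alpha * b11 - a10 * b01) for lower-triangular a11,
+    // writing the result to both b11 and c11.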
+ (void) data;
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a10 = a10_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b01 = b01_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (m <= 0 || n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+
+ DATATYPE alpha_cast = *alpha;
+ if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
+ switch (m) {
+ case 6:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ case 5:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ case 4:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v14, 0");
+ case 3:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v10, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ case 1:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ }
+ }
+ else {
+ const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR;
+ switch (m) {
+ case 6:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 5:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 3:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE));
+ case 1:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+ }
+ }
+
+ if (k >= 1) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
+ while (k > 0) {
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ k -= 1;
+
+ if (k == 0) { break; }
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ k -= 1;
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b01));
+ __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
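+    // TRSM phase: the packed a11 is assumed to carry an already-inverted diagonal,
+    // so each row of b11 is solved with a complex multiply, streamed out to b11 and
+    // c11, and then eliminated (vcnmsac_vf) from the rows below.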
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 1) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE));
+ vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 2) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE));
+ vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 3) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE));
+ vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 4) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE));
+ vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 5) return;
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE));
+ __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE));
+ vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ return;
+}
+
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
new file mode 100644
index 0000000000..a0f9134731
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c
@@ -0,0 +1,253 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+{
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a10 = a10_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b01 = b01_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
+ return;
+
+ dim_t b11_offset, temp;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
+
+ // Multiply step sizes by data size
+ __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+
+ __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1));
+ __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE));
+ __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp));
+ // b11_offset = (m-1)*PACKNR*FLT_SIZE
+
+ __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset));
+ __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); // TO DO: optimize alpha = 1 case
+    switch (m){ // Vector loads from b11 using a Duff's-device-style fall-through, multiplying by alpha
+ case 7: __asm__(VLE " v0, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 6: __asm__(VLE " v4, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v4, v4, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 5: __asm__(VLE " v8, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v8, v8, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v12, v12, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v16, v16, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v20, v20, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v24, v24, f0");
+ // no sub of b11 on final entry
+ }
+ // b11 now reset to original value
+ // v0 = row 6 of b11
+ // v4 = row 5 of b11
+ // v8 = row 4 of b11
+ // v12 = row 3 of b11
+ // v16 = row 2 of b11
+ // v20 = row 1 of b11
+ // v24 = row 0 of b11
+
+ // GEMM: B11 := alpha * B11 - A10 * B01
+ for (dim_t i = 0; i < k; i++){
+        __asm__(VLE " v28, (%0)": : "r"(b01)); // row i of b01
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v0, f6, v28");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v4, f5, v28");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v8, f4, v28");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v12, f3, v28");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v16, f2, v28");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v20, f1, v28");
+ case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10));
+ __asm__("vfnmsac.vf v24, f0, v28");
+ }
+ __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE));
+ }
+ // TRSM: B11 := inv(A11) * B11
+ // TO DO: Investigate code size reduction (loop rerolling)
+
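+    // The packed A11 is assumed to store its diagonal pre-inverted, so each row
+    // solve below is a vfmul rather than a division.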
+ // Row 0
+ __asm__(FLT_LOAD " f0, %0(%1)": : "I"(0*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v24, v24, f0");
+ __asm__(VSE " v24, (%0)": : "r"(b11));
+ __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 1) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v24");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v24");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v24");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v24");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v24");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v20, f1, v24");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 1
+ __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v20, v20, f1");
+ __asm__(VSE " v20, (%0)": : "r"(b11));
+ __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 2) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v20");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v20");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v20");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v20");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v20");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 2
+ __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v16, v16, f2");
+ __asm__(VSE " v16, (%0)": : "r"(b11));
+ __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 3) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v16");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v16");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v16");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v16");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 3
+ __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v12, v12, f3");
+ __asm__(VSE " v12, (%0)": : "r"(b11));
+ __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 4) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v12");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v12");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v12");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 4
+ __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v8, v8, f4");
+ __asm__(VSE " v8, (%0)": : "r"(b11));
+ __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 5) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v8");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v8");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 5
+ __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v4, v4, f5");
+ __asm__(VSE " v4, (%0)": : "r"(b11));
+ __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 6) return;
+
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v4");
+
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 6
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v0, v0, f6");
+ __asm__(VSE " v0, (%0)": : "r"(b11));
+ __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+}
+#endif
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
new file mode 100644
index 0000000000..4323f8fbf6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c
@@ -0,0 +1,182 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#include "blis.h"
+#include "../../riscv_cmul_macros_asm.h"
+#include <stdint.h>
+#include <stddef.h>
+
+#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\
+ dim_t m, \
+ dim_t n, \
+ dim_t k, \
+ const T* restrict alpha_, \
+ const T* restrict a10_, \
+ const T* restrict a11_, \
+ const T* restrict b01_, \
+ T* restrict b11_, \
+ T* restrict c11_, \
+ inc_t rsc, \
+ inc_t csc, \
+ auxinfo_t* restrict data, \
+ const cntx_t* restrict cntx \
+ )
+
+#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\
+ dim_t m, \
+ dim_t n, \
+ dim_t k, \
+ const T* restrict alpha_, \
+ const T* restrict a12_, \
+ const T* restrict a11_, \
+ const T* restrict b21_, \
+ T* restrict b11_, \
+ T* restrict c11_, \
+ inc_t rsc, \
+ inc_t csc, \
+ auxinfo_t* restrict data, \
+ const cntx_t* restrict cntx \
+ )
+
+#define GEMMTRSM(macro, ...) macro(__VA_ARGS__)
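+// Each precision below sets its element type, PACKMR/PACKNR, and load/store
+// mnemonics, then re-includes the shared kernel bodies so that the same source
+// expands into the s, d, c, and z microkernels.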
+
+// Single precision real
+#define DATATYPE float
+#define PRECISION_CHAR s
+#define PACKMR 8
+#define PACKNR 64
+#define VLE "vle32.v"
+#define VSE "vse32.v"
+#define VSSE "vsse32.v"
+#define FLT_LOAD "flw"
+#define FLT_SIZE sizeof(float)
+#define LOG_FLT_SIZE 2
+
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Double precision real
+#define DATATYPE double
+#define PRECISION_CHAR d
+#define PACKMR 8
+#define PACKNR 32
+#define VLE "vle64.v"
+#define VSE "vse64.v"
+#define VSSE "vsse64.v"
+#define FLT_LOAD "fld"
+#define FLT_SIZE sizeof(double)
+#define LOG_FLT_SIZE 3
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLE
+#undef VSE
+#undef VSSE
+#undef FLT_LOAD
+#undef FLT_SIZE
+#undef LOG_FLT_SIZE
+
+// Single precision complex
+#define DATATYPE scomplex
+#define PRECISION_CHAR c
+#define PACKMR 8
+#define PACKNR 32
+#define VLSEG2 "vlseg2e32.v "
+#define VSSEG2 "vsseg2e32.v "
+#define VSSSEG2 "vssseg2e32.v "
+#define FLT_LOAD "flw "
+#define FLT_SIZE sizeof(float)
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+// Double precision complex
+#define DATATYPE dcomplex
+#define PRECISION_CHAR z
+#define PACKMR 8
+#define PACKNR 16
+#define VLSEG2 "vlseg2e64.v "
+#define VSSEG2 "vsseg2e64.v "
+#define VSSSEG2 "vssseg2e64.v "
+#define FLT_LOAD "fld "
+#define FLT_SIZE sizeof(double)
+
+#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c"
+#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c"
+
+#undef DATATYPE
+#undef PRECISION_CHAR
+#undef PACKMR
+#undef PACKNR
+#undef VLSEG2
+#undef VSSEG2
+#undef VSSSEG2
+#undef FLT_LOAD
+#undef FLT_SIZE
+
+
+
+#undef GEMMTRSM
+#undef GEMMTRSM_L
+#undef GEMMTRSM_U
+
+
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
new file mode 100644
index 0000000000..9332fd0963
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c
@@ -0,0 +1,331 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{
+ (void) data;
+ (void) cntx;
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a12 = a12_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b21 = b21_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (m <= 0 || n <= 0)
+ return;
+
+ __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE));
+
+ DATATYPE alpha_cast = *alpha;
+ if (alpha_cast.real == 0 && alpha_cast.imag == 0) {
+ switch (m) {
+ case 6:
+ __asm__("vmv.v.i v20, 0");
+ __asm__("vmv.v.i v22, 0");
+ case 5:
+ __asm__("vmv.v.i v16, 0");
+ __asm__("vmv.v.i v18, 0");
+ case 4:
+ __asm__("vmv.v.i v12, 0");
+ __asm__("vmv.v.i v14, 0");
+ case 3:
+ __asm__("vmv.v.i v8, 0");
+ __asm__("vmv.v.i v10, 0");
+ case 2:
+ __asm__("vmv.v.i v4, 0");
+ __asm__("vmv.v.i v6, 0");
+ case 1:
+ __asm__("vmv.v.i v0, 0");
+ __asm__("vmv.v.i v2, 0");
+ }
+ }
+ else {
+ const DATATYPE* b11_tmp = b11;
+ switch (m) {
+ case 6:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 5:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 4:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 3:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 2:
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag);
+ __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE));
+ case 1:
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp));
+ vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag);
+ }
+ }
+
+ if (k >= 1) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+
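+    // Point a12 at its last entry so earlier rows are reached with negative offsets;
+    // v0/v2 hold row m-1 of b11, which the upper-triangular solve handles first.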
+ a12 += m - 1;
+
+ while (k > 0) {
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v24, v26);
+ }
+ k -= 1;
+
+ if (k == 0) { break; }
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v24, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v28, v30);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v28, v30);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v28, v30);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v28, v30);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v28, v30);
+ case 1:
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE));
+ vcnmsac_vf(v0, v2, ft0, ft1, v28, v30);
+ }
+ k -= 1;
+
+ if (k >= 2) {
+ __asm__(VLSEG2 "v28, (%0)" : : "r"(b21));
+ __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE));
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE));
+ }
+
+ a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR
+ b11 += (m - 1) * PACKNR;
+ c11 += (m - 1) * rsc;
+ rsc *= 2 * FLT_SIZE;
+ csc *= 2 * FLT_SIZE;
+
+ __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v0, v2, ft0, ft1);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 1) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ case 2:
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+ vcnmsac_vf(v4, v6, ft2, ft3, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE));
+ vcmul_vf(v24, v26, v4, v6, ft2, ft3);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 2) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ case 3:
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcnmsac_vf(v8, v10, ft4, ft5, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE));
+ vcmul_vf(v24, v26, v8, v10, ft4, ft5);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 3) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ case 4:
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcnmsac_vf(v12, v14, ft6, ft7, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE));
+ vcmul_vf(v24, v26, v12, v14, ft6, ft7);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 4) return;
+
+ switch (m) {
+ case 6:
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+ case 5:
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcnmsac_vf(v16, v18, ft8, ft9, v24, v26);
+ }
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE));
+ vcmul_vf(v24, v26, v16, v18, ft8, ft9);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ if (m == 5) return;
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcnmsac_vf(v20, v22, ft10, ft11, v24, v26);
+
+ __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE));
+ __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE));
+ __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc));
+
+ __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE));
+ __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE));
+ vcmul_vf(v24, v26, v20, v22, ft10, ft11);
+ __asm__(VSSEG2 "v24, (%0)" : : "r"(b11));
+ __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc));
+
+ return;
+}
+#endif
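
For reference, the fragment above belongs to the complex gemmtrsm_u microkernel (the counterpart of the real kernel added next), which per BLIS's gemmtrsm definition computes B11 := inv(A11) * (alpha*B11 - A12*B21). The diagonal of A11 is assumed pre-inverted during packing, so each per-row "division" is just a complex multiply (vcmul_vf against the diagonal element), and every solved row is streamed out to both the packed b11 and to c11. The plain-C model below restates that computation with ordinary row-major arrays; the cplx typedef, leading dimensions, and function name are illustrative assumptions, not the kernel's packed layouts.

// Plain-C model of the operation computed by the kernel above (illustrative).
// Assumes a11's diagonal was pre-inverted by the packing routine, as BLIS
// does for trsm; arrays are ordinary row-major, not BLIS packed panels.
#include <complex.h>
#include <stddef.h>

typedef double complex cplx;

static void gemmtrsm_u_model(size_t m, size_t n, size_t k, cplx alpha,
                             const cplx* a12,   // m x k, right of the diagonal block
                             const cplx* a11,   // m x m, upper triangular, inverted diagonal
                             const cplx* b21,   // k x n
                             cplx* b11)         // m x n, overwritten with the solution
{
    // B11 := alpha*B11 - A12*B21   (the k-loop built from vcmul_vf2/vcnmsac_vf)
    for (size_t i = 0; i < m; i++)
        for (size_t j = 0; j < n; j++)
        {
            cplx t = alpha * b11[i*n + j];
            for (size_t p = 0; p < k; p++)
                t -= a12[i*k + p] * b21[p*n + j];
            b11[i*n + j] = t;
        }

    // B11 := inv(A11)*B11, bottom row first (the per-row vcmul_vf/vcnmsac_vf chain)
    for (size_t i = m; i-- > 0; )
    {
        for (size_t j = 0; j < n; j++)
            b11[i*n + j] *= a11[i*m + i];        // diagonal is already inverted
        for (size_t p = 0; p < i; p++)           // eliminate row i from the rows above
            for (size_t j = 0; j < n; j++)
                b11[p*n + j] -= a11[p*m + i] * b11[i*n + j];
    }
}
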
diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
new file mode 100644
index 0000000000..2d511a8ba6
--- /dev/null
+++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c
@@ -0,0 +1,260 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// clang-format off
+#ifdef GEMMTRSM
+
+GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void)
+{
+ const DATATYPE* restrict alpha = alpha_;
+ const DATATYPE* restrict a12 = a12_;
+ const DATATYPE* restrict a11 = a11_;
+ const DATATYPE* restrict b21 = b21_;
+ const DATATYPE* restrict b11 = b11_;
+ DATATYPE* restrict c11 = c11_;
+
+ if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR))
+ return;
+
+ dim_t m_sz, a11_offset, c11_offset, temp;
+ size_t vl;
+ __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE));
+
+ // Multiply step sizes by data size
+ __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE));
+ __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE));
+
+ __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE));
+ __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp));
+ __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE));
+ __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc));
+ __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc));
+ // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE
+ // c11_offset = rsc*(m-1)*sz
+
+ __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha));
+ switch (m){ // Vector loads from b11 with a Duff's device, multiplying by alpha
+ case 7: __asm__(VLE " v0, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v0, v0, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 6: __asm__(VLE " v4, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v4, v4, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 5: __asm__(VLE " v8, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v8, v8, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 4: __asm__(VLE " v12, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v12, v12, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 3: __asm__(VLE " v16, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v16, v16, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 2: __asm__(VLE " v20, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v20, v20, f0");
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE));
+ case 1: __asm__(VLE " v24, (%0)": : "r"(b11));
+ __asm__("vfmul.vf v24, v24, f0");
+ // no add of b11 on final entry
+ }
+ // b11 now positioned at start of last row
+ // v24 = row 0 from bottom (bottom row)
+ // v20 = row 1 from bottom
+ // v16 = row 2 from bottom
+ // v12 = row 3 from bottom
+ // v8 = row 4 from bottom
+ // v4 = row 5 from bottom
+ // v0 = row 6 from bottom
+
+ // GEMM: B11 := alpha * B11 - A12 * B21
+ __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz));
+ for (dim_t i = 0; i < k; i++){
+ __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v0, f6, v28");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v4, f5, v28");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v8, f4, v28");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v12, f3, v28");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v16, f2, v28");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v20, f1, v28");
+ case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), "r"(a12));
+ __asm__("vfnmsac.vf v24, f0, v28");
+ }
+ __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE));
+ }
+ // TRSM: B11 := inv(A11) * B11
+ // Move a11 to end of array and c11 to first entry in last row
+ __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset));
+ __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset));
+
+ // Row 0 from bottom (bottom row)
+ __asm__(FLT_LOAD " f0, %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v24, v24, f0");
+ __asm__(VSE " v24, (%0)": : "r"(b11));
+ __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 1) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v24");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v24");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v24");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v24");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v24");
+ case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v20, f1, v24");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 1 from bottom
+ __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v20, v20, f1");
+ __asm__(VSE " v20, (%0)": : "r"(b11));
+ __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 2) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v20");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v20");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v20");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v20");
+ case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v16, f2, v20");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 2 from bottom
+ __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v16, v16, f2");
+ __asm__(VSE " v16, (%0)": : "r"(b11));
+ __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 3) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v16");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v16");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v16");
+ case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v12, f3, v16");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 3 from bottom
+ __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v12, v12, f3");
+ __asm__(VSE " v12, (%0)": : "r"(b11));
+ __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 4) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v12");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v12");
+ case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v8, f4, v12");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 4 from bottom
+ __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v8, v8, f4");
+ __asm__(VSE " v8, (%0)": : "r"(b11));
+ __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 5) return;
+
+ switch (m){
+ case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v8");
+ case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v4, f5, v8");
+ }
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 5 from bottom
+ __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v4, v4, f5");
+ __asm__(VSE " v4, (%0)": : "r"(b11));
+ __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc));
+ if (m == 6) return;
+
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfnmsac.vf v0, f6, v4");
+
+ // Pointer bumps
+ __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE));
+ __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE));
+ __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc));
+
+ // Row 6 from bottom
+ __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11));
+ __asm__("vfmul.vf v0, v0, f6");
+ __asm__(VSE " v0, (%0)": : "r"(b11));
+ __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc));
+
+}
+#endif
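
The real upper-triangular kernel above mirrors the complex one with one vector register group per row of B11. The sketch below condenses its TRSM stage into scalar C to make the bottom-up row order and the dual store (VSE back into the packed b11, VSSE into c11) explicit. Here rsc/csc are element strides (the assembly pre-scales them to byte strides), and the column-stride-PACKMR indexing of a11 is inferred from the load offsets above, so treat the layout as an illustrative assumption.

// Scalar model of the back-substitution and store pattern above (real case,
// illustrative). a11's diagonal is assumed pre-inverted by packing; a11 is
// treated as packed with column stride packmr, b11 with row stride packnr.
#include <stddef.h>

typedef ptrdiff_t inc_t;

static void trsm_u_stage_model(size_t m, size_t n,
                               const double* a11, double* b11, double* c11,
                               inc_t rsc, inc_t csc,
                               size_t packmr, size_t packnr)
{
    for (size_t i = m; i-- > 0; )                 // bottom row first, as in the asm
    {
        double* bi = b11 + i*packnr;
        for (size_t j = 0; j < n; j++)
            bi[j] *= a11[i*packmr + i];           // vfmul.vf with the inverted diagonal
        for (size_t j = 0; j < n; j++)            // VSE keeps b11 updated in place;
            c11[i*rsc + j*csc] = bi[j];           // VSSE writes the same row to c11
        for (size_t p = 0; p < i; p++)            // vfnmsac.vf: eliminate from rows above
            for (size_t j = 0; j < n; j++)
                b11[p*packnr + j] -= a11[i*packmr + p] * bi[j];
    }
}
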
diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h
new file mode 100644
index 0000000000..425c7dad92
--- /dev/null
+++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h
@@ -0,0 +1,160 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// Level 1
+ADDV_KER_PROT(float, s, addv_sifive_x280_intr)
+ADDV_KER_PROT(double, d, addv_sifive_x280_intr)
+ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr)
+ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr)
+
+AMAXV_KER_PROT(float, s, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(double, d, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_asm)
+AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_asm)
+
+AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(scomplex, c, axpbyv_sifive_x280_intr)
+AXPBYV_KER_PROT(dcomplex, z, axpbyv_sifive_x280_intr)
+
+AXPYV_KER_PROT(float, s, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr)
+AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_x280_intr)
+
+COPYV_KER_PROT(float, s, copyv_sifive_x280_asm)
+COPYV_KER_PROT(double, d, copyv_sifive_x280_asm)
+COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_asm)
+COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_asm)
+
+DOTV_KER_PROT(float, s, dotv_sifive_x280_intr)
+DOTV_KER_PROT(double, d, dotv_sifive_x280_intr)
+DOTV_KER_PROT(scomplex, c, dotv_sifive_x280_intr)
+DOTV_KER_PROT(dcomplex, z, dotv_sifive_x280_intr)
+
+DOTXV_KER_PROT(float, s, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr)
+DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr)
+
+INVERTV_KER_PROT(float, s, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(double, d, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_asm)
+INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_asm)
+
+INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm)
+INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm)
+
+SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(scomplex, c, scal2v_sifive_x280_intr)
+SCAL2V_KER_PROT(dcomplex, z, scal2v_sifive_x280_intr)
+
+SCALV_KER_PROT(float, s, scalv_sifive_x280_intr)
+SCALV_KER_PROT(double, d, scalv_sifive_x280_intr)
+SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr)
+SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr)
+
+SETV_KER_PROT(float, s, setv_sifive_x280_asm)
+SETV_KER_PROT(double, d, setv_sifive_x280_asm)
+SETV_KER_PROT(scomplex, c, setv_sifive_x280_asm)
+SETV_KER_PROT(dcomplex, z, setv_sifive_x280_asm)
+
+SUBV_KER_PROT(float, s, subv_sifive_x280_intr)
+SUBV_KER_PROT(double, d, subv_sifive_x280_intr)
+SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr)
+SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr)
+
+SWAPV_KER_PROT(float, s, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(double, d, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_asm)
+SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_asm)
+
+XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(double, d, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(scomplex, c, xpbyv_sifive_x280_intr)
+XPBYV_KER_PROT(dcomplex, z, xpbyv_sifive_x280_intr)
+
+// Level 1f
+AXPY2V_KER_PROT(float, s, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr)
+AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr)
+
+AXPYF_KER_PROT(float, s, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(double, d, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_asm)
+AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_asm)
+
+DOTXF_KER_PROT(float, s, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(double, d, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_asm)
+DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_asm)
+
+DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr)
+DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr)
+
+DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(scomplex, c, dotxaxpyf_sifive_x280_asm)
+DOTXAXPYF_KER_PROT(dcomplex, z, dotxaxpyf_sifive_x280_asm)
+
+// Level 1m
+PACKM_KER_PROT(float, s, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(double, d, packm_sifive_x280_asm_7xk)
+PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_6xk)
+PACKM_KER_PROT(float, s, packm_sifive_x280_asm_64xk)
+PACKM_KER_PROT(double, d, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_32xk)
+PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_16xk)
+
+// Level 3
+GEMM_UKR_PROT(float, s, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(double, d, gemm_sifive_x280_asm_7m4)
+GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_asm_6m2)
+GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_asm_6m2)
+
+GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm)
+GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm)
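
These macros only declare the kernel symbols; the definitions live in the files added above, and the sifive_x280 configuration registers them by name. Each *_KER_PROT(ctype, ch, name) line pastes the type character onto the kernel name, so the Level-1 block declares bli_saddv_sifive_x280_intr, bli_daddv_sifive_x280_intr, bli_caddv_sifive_x280_intr, and bli_zaddv_sifive_x280_intr, while the Level-3 block declares bli_sgemm_sifive_x280_asm_7m4 and so on. Roughly, one such prototype has the shape sketched below (parameter qualifiers elided; the authoritative expansion is BLIS's own kernel-prototype macros):

// Illustrative shape of the prototype generated by
// ADDV_KER_PROT(float, s, addv_sifive_x280_intr); qualifiers elided.
#include "blis.h"

void bli_saddv_sifive_x280_intr
     (
       conj_t  conjx,
       dim_t   n,
       float*  x, inc_t incx,
       float*  y, inc_t incy,
       cntx_t* cntx
     );
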
diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h
new file mode 100644
index 0000000000..9c33fd7bc5
--- /dev/null
+++ b/kernels/sifive_x280/riscv_cmul_macros_asm.h
@@ -0,0 +1,137 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// macros to emit complex multiplication
+// caveat: the destination registers cannot overlap the source registers!
+// rd = rs1 * rs2
+#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \
+ \
+ __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\
+ __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\
+ __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\
+ __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i)
+
+// vd = vs2 * f[rs1]
+#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd = conj(vs2) * f[rs1]
+#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \
+ \
+ __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\
+ __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += vs2 * f[rs1]
+#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\
+ __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\
+ __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r))
+
+// vd += conj(vs2) * f[rs1]
+#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd -= vs2 * f[rs1]
+#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \
+ \
+ __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\
+ __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\
+ __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i)
+
+// vd = vs2 * vs1
+#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd = vs2 * conj(vs1)
+#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * vs1
+#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
+// vd += vs2 * conj(vs1)
+#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \
+ \
+ __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\
+ __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\
+ __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r)
+
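
Written out as ordinary arithmetic, these macro families compute the following: the plain forms take register names as tokens, the *2 variants (where present) take C floating-point values bound through the "f" constraint, and the *_conj forms conjugate one of the two source operands (vs2 in the _vf forms, vs1 in the _vv forms). The no-overlap caveat exists because the first two multiplies write the destination pair before all source operands have been read. A scalar model of the emitted arithmetic, with an illustrative struct standing in for the split real/imaginary register pairs:

// Scalar model of the complex arithmetic the macros above emit (illustrative;
// the macros operate on split real/imaginary vector registers, not a struct).
typedef struct { double r, i; } cplx;

static inline cplx cmul_model(cplx a, cplx b)        // cmul / vcmul_vf / vcmul_vv: a * b
{ return (cplx){ a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r }; }

static inline cplx cmul_conj_model(cplx a, cplx b)   // vcmul_vf_conj: conj(a) * b
{ return (cplx){ a.r*b.r + a.i*b.i, a.r*b.i - a.i*b.r }; }

static inline cplx cmacc_model(cplx d, cplx f, cplx v)   // vcmacc_vf: d += v * f
{ d.r += v.r*f.r - v.i*f.i; d.i += v.r*f.i + v.i*f.r; return d; }

static inline cplx cnmsac_model(cplx d, cplx f, cplx v)  // vcnmsac_vf: d -= v * f
{ d.r -= v.r*f.r - v.i*f.i; d.i -= v.r*f.i + v.i*f.r; return d; }
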
diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
new file mode 100644
index 0000000000..6a1d11b131
--- /dev/null
+++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h
@@ -0,0 +1,116 @@
+/*
+
+ BLIS
+ An object-based framework for developing high-performance BLAS-like
+ libraries.
+
+ Copyright (C) 2023, SiFive, Inc.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name(s) of the copyright holder(s) nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// 6. Configuration-Setting and Utility Functions
+#define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t
+#define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL)
+#define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t
+#define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS)
+#define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL
+#define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL)
+
+// 7. Vector Loads and Stores
+// Loads
+#define VLE_V_F_(PRECISION, LMUL) __riscv_vle##PRECISION##_v_f##PRECISION##LMUL
+#define VLE_V_F(PRECISION, LMUL) VLE_V_F_(PRECISION, LMUL)
+#define VLSE_V_F_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL
+#define VLSE_V_F(PRECISION, LMUL) VLSE_V_F_(PRECISION, LMUL)
+#define VLSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+// Stores
+#define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL
+#define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL)
+#define VSSE_V_F_(PRECISION, LMUL) __riscv_vsse##PRECISION##_v_f##PRECISION##LMUL
+#define VSSE_V_F(PRECISION, LMUL) VSSE_V_F_(PRECISION, LMUL)
+#define VSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS
+#define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS)
+
+// 13. Vector Floating-Point Operations
+#define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL
+#define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL)
+#define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL
+#define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL
+#define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL)
+#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL
+#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL)
+#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL
+#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL)
+#define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL
+#define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL)
+#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu
+#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL)
+#define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL
+#define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL
+#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL)
+#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu
+#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL)
+#define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL
+#define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL)
+#define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL
+#define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL)
+#define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL
+#define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL)
+#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG
+#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL)
+
+// 14. Vector Reduction Operations
+#define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1
+#define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL)
+
+// 16. Vector Permutation Operations
+#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL
+#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL)
+#define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION
+#define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION)
+
+// Miscellaneous Vector Functions
+#define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL
+#define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL)
+#define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL
+#define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL)
+#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL
+#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS)
+#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS
+#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS)
+
+// Non-vector functions
+#define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__))
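
These wrappers are pure token pasting onto the RVV C intrinsics naming scheme (the __riscv_-prefixed v0.12-style names) and the corresponding vector and tuple types: VSETVL(32, m2) expands to __riscv_vsetvl_e32m2, RVV_TYPE_FX(32, m2, 2) to vfloat32m2x2_t, VLSEG2_V_F(32, m2, 2) to __riscv_vlseg2e32_v_f32m2x2, and so on. A minimal sketch of how the *_intr kernels call them, shown here as a standalone axpy-style loop rather than an actual BLIS kernel (assumes this header plus <riscv_vector.h> and a compiler targeting RVV 1.0):

#include <stddef.h>
#include <riscv_vector.h>
// #include "riscv_overloaded_intrinsics.h"   // the wrappers defined above

// Illustrative only: strip-mined y += alpha*x using the e32/m2 instantiation.
static void saxpy_sketch(size_t n, float alpha, const float* restrict x,
                         float* restrict y)
{
    for (size_t avl = n; avl > 0; )
    {
        size_t vl = VSETVL(32, m2)(avl);                 // __riscv_vsetvl_e32m2
        RVV_TYPE_F(32, m2) xv = VLE_V_F(32, m2)(x, vl);  // __riscv_vle32_v_f32m2
        RVV_TYPE_F(32, m2) yv = VLE_V_F(32, m2)(y, vl);
        yv = VFMACC_VF(32, m2)(yv, alpha, xv, vl);       // y += alpha * x
        VSE_V_F(32, m2)(y, yv, vl);                      // __riscv_vse32_v_f32m2
        x += vl; y += vl; avl -= vl;
    }
}
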
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c
index 13bb8ea4c7..56750edf57 100644
--- a/testsuite/src/test_libblis.c
+++ b/testsuite/src/test_libblis.c
@@ -135,15 +135,16 @@ void libblis_test_thread_decorator( test_params_t* params, test_ops_t* ops )
err_t r_val;
#ifdef BLIS_ENABLE_HPX
+ size_t nt = ( size_t )params->n_app_threads;
- size_t tdata_size = ( size_t )params->n_app_threads *
+ size_t tdata_size = ( size_t )nt *
( size_t )sizeof( thread_data_t );
thread_data_t* tdata = bli_malloc_user( tdata_size, &r_val );
tdata->params = params;
tdata->ops = ops;
tdata->nt = nt;
- tdata->id = 1;
+ tdata->id = 0;
tdata->xc = 0;
// Walk through all test modules.
diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh
index a51d33061a..56c2b85c26 100755
--- a/travis/do_riscv.sh
+++ b/travis/do_riscv.sh
@@ -3,16 +3,19 @@
set -e
set -x
-TAG=2023.02.25
+TAG=2023.10.18
# The prebuilt toolchains only support hardfloat, so we only
# test these for now.
case $1 in
"rv32iv")
- TARBALL=riscv32-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+ TARBALL=riscv32-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
;;
"rv64iv")
- TARBALL=riscv64-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz
+ TARBALL=riscv64-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz
+ ;;
+ "sifive_x280")
+ TARBALL=riscv64-glibc-ubuntu-20.04-llvm-nightly-${TAG}-nightly.tar.gz
;;
*)
exit 1