diff --git a/.travis.yml b/.travis.yml index 848cb18437..bdfafb6b0f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -86,6 +86,11 @@ matrix: env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="rv32iv" \ CC=riscv32-unknown-linux-gnu-gcc \ LDFLAGS=-static + - os: linux + compiler: clang + env: OOT=0 TEST=FAST SDE=0 THR="none" BLD="--disable-shared" CONF="sifive_x280" \ + CC=clang \ + LDFLAGS=-static install: - if [ "$CC" = "gcc" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then export CC="gcc-9"; fi - if [ -n "$PACKAGES" ] && [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y $PACKAGES; fi @@ -106,6 +111,12 @@ script: export CXX=$DIST_PATH/../toolchain/riscv/bin/riscv32-unknown-linux-gnu-g++; export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv32 -cpu rv32,vext_spec=v1.0,v=true,vlen=128 -B 0x100000"; fi +- if [ "$CONF" = "sifive_x280" ]; then + $DIST_PATH/travis/do_riscv.sh "$CONF"; + export CC=$DIST_PATH/../toolchain/riscv/bin/clang; + export CXX=$DIST_PATH/../toolchain/riscv/bin/clang++; + export TESTSUITE_WRAPPER="$DIST_PATH/../toolchain/qemu-riscv64 -cpu rv64,vext_spec=v1.0,v=true,vlen=512 -B 0x100000"; + fi - $DIST_PATH/configure -p `pwd`/../install -t $THR $BLD CC=$CC $CONF - pwd - ls -l diff --git a/CREDITS b/CREDITS index 99ae43bd6a..fa99b0572a 100644 --- a/CREDITS +++ b/CREDITS @@ -22,6 +22,7 @@ but many others have contributed code, ideas, and feedback, including Matthew Brett @matthew-brett (University of Birmingham) Jérémie du Boisberranger @jeremiedbb Jed Brown @jedbrown (Argonne National Laboratory) + Alex Chiang @alexsifivetw (SiFive) Robin Christ @robinchrist Dilyn Corner @dilyn-corner Mat Cross @matcross (NAG) @@ -54,6 +55,7 @@ but many others have contributed code, ideas, and feedback, including Minh Quan Ho @hominhquan Matthew Honnibal @honnibal Stefan Husmann @stefanhusmann + Aaron Hutchinson @Aaron-Hutchinson (SiFive) Francisco Igual @figual (Universidad Complutense de Madrid) John Mather @jmather-sesi (SideFX Software) Madeesh Kannan @shadeMe diff --git a/config/sifive_x280/bli_cntx_init_sifive_x280.c b/config/sifive_x280/bli_cntx_init_sifive_x280.c new file mode 100644 index 0000000000..197394c822 --- /dev/null +++ b/config/sifive_x280/bli_cntx_init_sifive_x280.c @@ -0,0 +1,226 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_sifive_x280( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_sifive_x280_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native kernels. + bli_cntx_set_ukrs + ( + cntx, + + // Level 1 + BLIS_ADDV_KER, BLIS_FLOAT, bli_saddv_sifive_x280_intr, + BLIS_ADDV_KER, BLIS_DOUBLE, bli_daddv_sifive_x280_intr, + BLIS_ADDV_KER, BLIS_SCOMPLEX, bli_caddv_sifive_x280_intr, + BLIS_ADDV_KER, BLIS_DCOMPLEX, bli_zaddv_sifive_x280_intr, + + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_sifive_x280_asm, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_sifive_x280_asm, + BLIS_AMAXV_KER, BLIS_SCOMPLEX, bli_camaxv_sifive_x280_asm, + BLIS_AMAXV_KER, BLIS_DCOMPLEX, bli_zamaxv_sifive_x280_asm, + + BLIS_AXPBYV_KER, BLIS_FLOAT, bli_saxpbyv_sifive_x280_intr, + BLIS_AXPBYV_KER, BLIS_DOUBLE, bli_daxpbyv_sifive_x280_intr, + BLIS_AXPBYV_KER, BLIS_SCOMPLEX, bli_caxpbyv_sifive_x280_intr, + BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_sifive_x280_intr, + + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_sifive_x280_intr, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_sifive_x280_intr, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_sifive_x280_intr, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_sifive_x280_intr, + + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_sifive_x280_asm, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_sifive_x280_asm, + BLIS_COPYV_KER, BLIS_SCOMPLEX, bli_ccopyv_sifive_x280_asm, + BLIS_COPYV_KER, BLIS_DCOMPLEX, bli_zcopyv_sifive_x280_asm, + + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_sifive_x280_intr, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_sifive_x280_intr, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_sifive_x280_intr, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_sifive_x280_intr, + + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_sifive_x280_intr, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_sifive_x280_intr, + BLIS_DOTXV_KER, BLIS_SCOMPLEX, bli_cdotxv_sifive_x280_intr, + BLIS_DOTXV_KER, BLIS_DCOMPLEX, bli_zdotxv_sifive_x280_intr, + + BLIS_INVERTV_KER, BLIS_FLOAT, bli_sinvertv_sifive_x280_asm, + BLIS_INVERTV_KER, BLIS_DOUBLE, bli_dinvertv_sifive_x280_asm, + BLIS_INVERTV_KER, BLIS_SCOMPLEX, bli_cinvertv_sifive_x280_asm, + BLIS_INVERTV_KER, BLIS_DCOMPLEX, bli_zinvertv_sifive_x280_asm, + + BLIS_INVSCALV_KER, BLIS_FLOAT, bli_sinvscalv_sifive_x280_asm, + BLIS_INVSCALV_KER, BLIS_DOUBLE, bli_dinvscalv_sifive_x280_asm, + BLIS_INVSCALV_KER, BLIS_SCOMPLEX, bli_cinvscalv_sifive_x280_asm, + BLIS_INVSCALV_KER, BLIS_DCOMPLEX, bli_zinvscalv_sifive_x280_asm, + + BLIS_SCAL2V_KER, BLIS_FLOAT, bli_sscal2v_sifive_x280_intr, + BLIS_SCAL2V_KER, BLIS_DOUBLE, bli_dscal2v_sifive_x280_intr, + BLIS_SCAL2V_KER, BLIS_SCOMPLEX, bli_cscal2v_sifive_x280_intr, + BLIS_SCAL2V_KER, BLIS_DCOMPLEX, bli_zscal2v_sifive_x280_intr, + + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_sifive_x280_intr, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_sifive_x280_intr, + BLIS_SCALV_KER, BLIS_SCOMPLEX, 
bli_cscalv_sifive_x280_intr, + BLIS_SCALV_KER, BLIS_DCOMPLEX, bli_zscalv_sifive_x280_intr, + + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_sifive_x280_asm, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_sifive_x280_asm, + BLIS_SETV_KER, BLIS_SCOMPLEX, bli_csetv_sifive_x280_asm, + BLIS_SETV_KER, BLIS_DCOMPLEX, bli_zsetv_sifive_x280_asm, + + BLIS_SUBV_KER, BLIS_FLOAT, bli_ssubv_sifive_x280_intr, + BLIS_SUBV_KER, BLIS_DOUBLE, bli_dsubv_sifive_x280_intr, + BLIS_SUBV_KER, BLIS_SCOMPLEX, bli_csubv_sifive_x280_intr, + BLIS_SUBV_KER, BLIS_DCOMPLEX, bli_zsubv_sifive_x280_intr, + + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_sifive_x280_asm, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_sifive_x280_asm, + BLIS_SWAPV_KER, BLIS_SCOMPLEX, bli_cswapv_sifive_x280_asm, + BLIS_SWAPV_KER, BLIS_DCOMPLEX, bli_zswapv_sifive_x280_asm, + + BLIS_XPBYV_KER, BLIS_FLOAT, bli_sxpbyv_sifive_x280_intr, + BLIS_XPBYV_KER, BLIS_DOUBLE, bli_dxpbyv_sifive_x280_intr, + BLIS_XPBYV_KER, BLIS_SCOMPLEX, bli_cxpbyv_sifive_x280_intr, + BLIS_XPBYV_KER, BLIS_DCOMPLEX, bli_zxpbyv_sifive_x280_intr, + + // Level 1f + BLIS_AXPY2V_KER, BLIS_FLOAT, bli_saxpy2v_sifive_x280_intr, + BLIS_AXPY2V_KER, BLIS_DOUBLE, bli_daxpy2v_sifive_x280_intr, + BLIS_AXPY2V_KER, BLIS_SCOMPLEX, bli_caxpy2v_sifive_x280_intr, + BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_sifive_x280_intr, + + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_sifive_x280_asm, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_sifive_x280_asm, + BLIS_AXPYF_KER, BLIS_SCOMPLEX, bli_caxpyf_sifive_x280_asm, + BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_sifive_x280_asm, + + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_sifive_x280_asm, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_sifive_x280_asm, + BLIS_DOTXF_KER, BLIS_SCOMPLEX, bli_cdotxf_sifive_x280_asm, + BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_sifive_x280_asm, + + BLIS_DOTAXPYV_KER, BLIS_FLOAT, bli_sdotaxpyv_sifive_x280_intr, + BLIS_DOTAXPYV_KER, BLIS_DOUBLE, bli_ddotaxpyv_sifive_x280_intr, + BLIS_DOTAXPYV_KER, BLIS_SCOMPLEX, bli_cdotaxpyv_sifive_x280_intr, + BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_sifive_x280_intr, + + BLIS_DOTXAXPYF_KER, BLIS_FLOAT, bli_sdotxaxpyf_sifive_x280_asm, + BLIS_DOTXAXPYF_KER, BLIS_DOUBLE, bli_ddotxaxpyf_sifive_x280_asm, + BLIS_DOTXAXPYF_KER, BLIS_SCOMPLEX, bli_cdotxaxpyf_sifive_x280_asm, + BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_sifive_x280_asm, + + // Level 1m + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_7xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_7xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_6xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_sifive_x280_asm_64xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_sifive_x280_asm_32xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_sifive_x280_asm_32xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_sifive_x280_asm_16xk, + + // Level 3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sifive_x280_asm_7m4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sifive_x280_asm_7m4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sifive_x280_asm_6m2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sifive_x280_asm_6m2, + + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_sifive_x280_asm, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_sifive_x280_asm, + BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_l_sifive_x280_asm, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_sifive_x280_asm, + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_sifive_x280_asm, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, 
bli_dgemmtrsm_u_sifive_x280_asm, + BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsm_u_sifive_x280_asm, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_sifive_x280_asm, + + BLIS_VA_END + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + BLIS_VA_END + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 7, 7, 6, 6, + 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 64, 32, 32, 16 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 28, 28, 24, 24 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 1024, 1024, 1024, 1024 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 128, 256, 128 ); + // Default BLIS_BBM_s = 1, but set here to ensure it's correct + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + cntx, + + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + + // level-1m + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + + BLIS_VA_END + ); +} + diff --git a/config/sifive_x280/bli_family_sifive_x280.h b/config/sifive_x280/bli_family_sifive_x280.h new file mode 100644 index 0000000000..4f02c048fa --- /dev/null +++ b/config/sifive_x280/bli_family_sifive_x280.h @@ -0,0 +1,34 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + diff --git a/config/sifive_x280/bli_kernel_defs_sifive_x280.h b/config/sifive_x280/bli_kernel_defs_sifive_x280.h new file mode 100644 index 0000000000..bb6865a669 --- /dev/null +++ b/config/sifive_x280/bli_kernel_defs_sifive_x280.h @@ -0,0 +1,55 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- +#define BLIS_MR_s 7 +#define BLIS_MR_d 7 +#define BLIS_MR_c 6 +#define BLIS_MR_z 6 + +#define BLIS_PACKMR_s 8 +#define BLIS_PACKMR_d 8 +#define BLIS_PACKMR_c 8 +#define BLIS_PACKMR_z 8 + +#define BLIS_NR_s 64 +#define BLIS_NR_d 32 +#define BLIS_NR_c 32 +#define BLIS_NR_z 16 +//#endif + diff --git a/config/sifive_x280/make_defs.mk b/config/sifive_x280/make_defs.mk new file mode 100644 index 0000000000..acdf5a3611 --- /dev/null +++ b/config/sifive_x280/make_defs.mk @@ -0,0 +1,78 @@ +# +# +# BLIS +# An object-based framework for developing high-performance BLAS-like +# libraries. +# +# Copyright (C) 2023, SiFive, Inc. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name(s) of the copyright holder(s) nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := sifive_x280 +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + + +# NOTE: The build system will append these variables with various +# general-purpose/configuration-agnostic flags in common.mk. You +# may specify additional flags here as needed. +CMISCFLAGS_SIFIVE := -mcmodel=medany -march=rv64gcv_zba_zbb_zvl512b -mabi=lp64d +CPPROCFLAGS := +CMISCFLAGS := $(CMISCFLAGS_SIFIVE) -fdata-sections -ffunction-sections \ + -fdiagnostics-color=always -fno-rtti -fno-exceptions +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wextra -Wno-unused-function -Wno-unused-parameter \ + -Wno-sign-compare -Wno-unused-variable + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 +else +COPTFLAGS := -Ofast +endif + +# Flags specific to optimized kernels. +CKOPTFLAGS := $(COPTFLAGS) +CKVECFLAGS := + +# Flags specific to reference kernels. +CROPTFLAGS := $(CKOPTFLAGS) +CRVECFLAGS := $(CKVECFLAGS) + +# Store all of the variables here to new variables containing the +# configuration name. +$(eval $(call store-make-defs,$(THIS_CONFIG))) + diff --git a/config_registry b/config_registry index 44bb069c91..8c1f6f2542 100644 --- a/config_registry +++ b/config_registry @@ -61,5 +61,8 @@ rv64i: rv64i/rvi rv32iv: rv32iv/rviv rv64iv: rv64iv/rviv +# SiFive architectures. +sifive_x280: sifive_x280 + # Generic architectures. generic: generic diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 111b27e208..a8061f9333 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -286,6 +286,11 @@ arch_t bli_arch_query_id_impl( void ) id = BLIS_ARCH_RV64IV; #endif + // SiFive microarchitectures. + #ifdef BLIS_FAMILY_SIFIVE_X280 + id = BLIS_ARCH_SIFIVE_X280; + #endif + // Generic microarchitecture. 
#ifdef BLIS_FAMILY_GENERIC id = BLIS_ARCH_GENERIC; @@ -351,6 +356,8 @@ static const char* config_name[ BLIS_NUM_ARCHS ] = "rv32iv", "rv64iv", + "sifive_x280", + "generic" }; diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 7b9ab3d7c2..a21aa12446 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -259,6 +259,14 @@ int bli_gks_init( void ) bli_cntx_init_rv64iv_ind ); #endif + // -- SiFive architectures ---------------------------------------------- + +#ifdef BLIS_CONFIG_SIFIVE_X280 + bli_gks_register_cntx( BLIS_ARCH_SIFIVE_X280, bli_cntx_init_sifive_x280, + bli_cntx_init_sifive_x280_ref, + bli_cntx_init_sifive_x280_ind ); +#endif + // -- Generic architectures -------------------------------------------- #ifdef BLIS_CONFIG_GENERIC diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index f8e18c5c10..361e9663d2 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -156,6 +156,12 @@ CNTX_INIT_PROTS( rv32iv ) CNTX_INIT_PROTS( rv64iv ) #endif +// -- SiFive architectures -- + +#ifdef BLIS_CONFIG_SIFIVE_X280 +CNTX_INIT_PROTS( sifive_x280 ) +#endif + // -- Generic -- #ifdef BLIS_CONFIG_GENERIC @@ -295,6 +301,12 @@ CNTX_INIT_PROTS( generic ) #include "bli_family_bgq.h" #endif +// -- SiFive families -- + +#ifdef BLIS_FAMILY_SIFIVE_X280 +#include "bli_family_sifive_x280.h" +#endif + // -- Generic -- #ifdef BLIS_FAMILY_GENERIC @@ -386,6 +398,12 @@ CNTX_INIT_PROTS( generic ) #include "bli_kernels_rviv.h" #endif +// -- SiFive RISC-V architectures -- + +#ifdef BLIS_KERNELS_SIFIVE_X280 +#include "bli_kernels_sifive_x280.h" +#endif + #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 60c55a5ed1..2f81a4749a 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -975,6 +975,9 @@ typedef enum BLIS_ARCH_RV32IV, BLIS_ARCH_RV64IV, + // SiFive + BLIS_ARCH_SIFIVE_X280, + // Generic architecture/configuration BLIS_ARCH_GENERIC, diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c new file mode 100644 index 0000000000..2b7ad6fe7d --- /dev/null +++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr.c @@ -0,0 +1,118 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define ADDV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##addv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define ADDV(...) ADDV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_addv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_addv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_addv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_addv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef ADDV +#undef ADDV_ diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..d5343befe0 --- /dev/null +++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_complex.c @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef ADDV + +ADDV(PRECISION_CHAR, void) +{ + // Computes y := y + conjx(x) + (void) cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + yvec_real = VFADD_VV(PREC, LMUL)(yvec_real, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE) + yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + else + yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + +} + +#endif // ADDV diff --git a/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..d4e7d4a45e --- /dev/null +++ b/kernels/sifive_x280/1/bli_addv_sifive_x280_intr/bli_addv_sifive_x280_intr_real.c @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef ADDV + +ADDV(PRECISION_CHAR, void) +{ + // Computes y = y + conjx(x) + // == y + x (real case) + + (void) cntx; + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFADD_VV(PREC, LMUL)(yvec, xvec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL) (y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } +} + +#endif // ADDV diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c new file mode 100644 index 0000000000..c423dd131d --- /dev/null +++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c @@ -0,0 +1,293 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, + dim_t *index, const cntx_t *cntx) { + // assumes 64-bit index + (void)cntx; + const float* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + incx *= 4; + size_t avl = n; + size_t offset = 0; + bool first = true; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" + : "=r"(vl) + : "r"(avl)); + if (incx == 4) + __asm__("vle32.v v24, (%0)" : : "r"(x)); + else + __asm__("vlse32.v v24, (%0), %1" : : "r"(x), "r"(incx)); + // check for NaN + __asm__ volatile("vmfne.vv v0, v24, v24"); + dim_t nan_index; + __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); + if (nan_index != -1) { + *index = nan_index + offset; + return; + } + if (first) { + __asm__("vfabs.v v8, v24"); + // keep vl same, change SEW and LMUL + __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); + __asm__("vid.v v16"); + first = false; + } else { + __asm__("vfabs.v v24, v24"); + __asm__("vmflt.vv v0, v8, v24"); + __asm__("vmerge.vvm v8, v8, v24, v0"); + // keep vl same, change SEW and LMUL + __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); + __asm__("vid.v v24"); + __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); + __asm__("vmerge.vvm v16, v16, v24, v0"); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + offset += vl; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); + __asm__("vmv.s.x v0, zero"); + __asm__("vfredmax.vs v0, v8, v0"); + __asm__("vrgather.vi v24, v0, 0"); + __asm__("vmfeq.vv v0, v8, v24"); + __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); + uint64_t imax = -1; + __asm__("vmv.s.x v24, %0" : : "r"(imax)); + __asm__("vredminu.vs v24, v16, v24, v0.t"); + __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); + __asm__("vse64.v v24, (%0)" : : "r"(index)); + return; +} + +void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, + dim_t *index, const cntx_t *cntx) { + // assumes 64-bit index + (void)cntx; + const double* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + incx *= 8; + size_t avl = n; + size_t offset = 0; + bool first = true; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e64, m8, tu, ma" + : "=r"(vl) + : "r"(avl)); + if (incx == 8) + __asm__("vle64.v v24, (%0)" : : "r"(x)); + else + __asm__("vlse64.v v24, (%0), %1" : : "r"(x), "r"(incx)); + // check for NaN + __asm__ volatile("vmfne.vv v0, v24, v24"); + dim_t nan_index; + __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); + if (nan_index != -1) { + *index = nan_index + offset; + return; + } + if (first) { + __asm__("vfabs.v v8, v24"); + __asm__("vid.v v16"); + first = false; + } else { + __asm__("vfabs.v v24, v24"); + __asm__("vmflt.vv v0, v8, v24"); + __asm__("vmerge.vvm v8, v8, v24, v0"); + __asm__("vid.v v24"); + __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); + __asm__("vmerge.vvm v16, v16, v24, v0"); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + offset += vl; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e64, m8, ta, ma" : : "r"(n)); + __asm__("vmv.s.x v0, zero"); + __asm__("vfredmax.vs v0, v8, v0"); + __asm__("vrgather.vi v24, v0, 0"); + __asm__("vmfeq.vv v0, v8, v24"); + uint64_t imax = -1; + __asm__("vmv.s.x v24, %0" : : "r"(imax)); + __asm__("vredminu.vs v24, v16, v24, v0.t"); + __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); + __asm__("vse64.v v24, (%0)" : : "r"(index)); + return; +} + +void 
bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, + dim_t *index, const cntx_t *cntx) { + // assumes 64-bit index + (void)cntx; + const scomplex* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + incx *= 8; + size_t avl = n; + size_t offset = 0; + bool first = true; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e32, m4, tu, ma" + : "=r"(vl) + : "r"(avl)); + if (incx == 8) + __asm__("vlseg2e32.v v24, (%0)" : : "r"(x)); + else + __asm__("vlsseg2e32.v v24, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfabs.v v24, v24"); + __asm__("vfabs.v v28, v28"); + __asm__("vfadd.vv v24, v24, v28"); + // check for NaN + __asm__ volatile("vmfne.vv v0, v24, v24"); + dim_t nan_index; + __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); + if (nan_index != -1) { + *index = nan_index + offset; + return; + } + if (first) { + __asm__("vmv4r.v v8, v24"); + // keep vl same, change SEW and LMUL + __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); + __asm__("vid.v v16"); + first = false; + } else { + __asm__("vmflt.vv v0, v8, v24"); + __asm__("vmerge.vvm v8, v8, v24, v0"); + // keep vl same, change SEW and LMUL + __asm__ volatile("vsetvli zero, zero, e64, m8, tu, ma"); + __asm__("vid.v v24"); + __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); + __asm__("vmerge.vvm v16, v16, v24, v0"); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + offset += vl; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e32, m4, ta, ma" : : "r"(n)); + __asm__("vmv.s.x v0, zero"); + __asm__("vfredmax.vs v0, v8, v0"); + __asm__("vrgather.vi v24, v0, 0"); + __asm__("vmfeq.vv v0, v8, v24"); + __asm__ volatile("vsetvli zero, zero, e64, m8, ta, ma"); + uint64_t imax = -1; + __asm__("vmv.s.x v24, %0" : : "r"(imax)); + __asm__("vredminu.vs v24, v16, v24, v0.t"); + __asm__ volatile("vsetivli zero, 1, e64, m1, ta, ma"); + __asm__("vse64.v v24, (%0)" : : "r"(index)); + return; +} + +void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx, + dim_t *index, const cntx_t *cntx) { + // assumes 64-bit index + (void)cntx; + const dcomplex* restrict x = x_; + + if (n <= 1) { + *index = 0; + return; + } + incx *= 16; + size_t avl = n; + size_t offset = 0; + bool first = true; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e64, m4, tu, ma" + : "=r"(vl) + : "r"(avl)); + if (incx == 16) + __asm__("vlseg2e64.v v24, (%0)" : : "r"(x)); + else + __asm__("vlsseg2e64.v v24, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfabs.v v24, v24"); + __asm__("vfabs.v v28, v28"); + __asm__("vfadd.vv v24, v24, v28"); + // check for NaN + __asm__ volatile("vmfne.vv v0, v24, v24"); + dim_t nan_index; + __asm__ volatile("vfirst.m %0, v0" : "=r"(nan_index)); + if (nan_index != -1) { + *index = nan_index + offset; + return; + } + if (first) { + __asm__("vmv4r.v v8, v24"); + __asm__("vid.v v16"); + first = false; + } else { + __asm__("vmflt.vv v0, v8, v24"); + __asm__("vmerge.vvm v8, v8, v24, v0"); + __asm__("vid.v v24"); + __asm__("vadd.vx v24, v24, %0" : : "r"(offset)); + __asm__("vmerge.vvm v16, v16, v24, v0"); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + offset += vl; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(n)); + __asm__("vmv.s.x v0, zero"); + __asm__("vfredmax.vs v0, v8, v0"); + __asm__("vrgather.vi v24, v0, 0"); + __asm__("vmfeq.vv v0, v8, v24"); + uint64_t imax = -1; + __asm__("vmv.s.x v24, %0" : : "r"(imax)); + __asm__("vredminu.vs v24, v16, v24, v0.t"); + __asm__ volatile("vsetivli zero, 1, e64, m1, 
ta, ma"); + __asm__("vse64.v v24, (%0)" : : "r"(index)); + return; +} diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c new file mode 100644 index 0000000000..3b29f898df --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr.c @@ -0,0 +1,129 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define AXPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpbyv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict alpha_, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define AXPBYV(...) 
AXPBYV_(__VA_ARGS__) + +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) +#define SCALV_(PRECISION_CHAR) bli_##PRECISION_CHAR##scalv_sifive_x280_intr +#define SCALV(PRECISION_CHAR) SCALV_(PRECISION_CHAR) +#define SCAL2V_(PRECISION_CHAR) bli_##PRECISION_CHAR##scal2v_sifive_x280_intr +#define SCAL2V(PRECISION_CHAR) SCAL2V_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpbyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpbyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpbyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpbyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef AXPBYV +#undef AXPBYV_ diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..31fc584b97 --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_complex.c @@ -0,0 +1,121 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPBYV + +AXPBYV(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjx(x) + + if (n <= 0) return; + + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict beta = beta_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (alpha->real == 0 && alpha->imag == 0 && beta->real == 0 && beta->imag == 0){ + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx); + return; + } + if (alpha->real == 0 && alpha->imag == 0){ + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx); + return; + } + if (beta->real == 0 && beta->imag == 0){ + SCAL2V(PRECISION_CHAR)(conjx, n, alpha, x, incx, y, incy, cntx); + return; + } + + // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we + // will canonicalize NaNs whereas the reference code will propagate NaN payloads. + + // TO DO (optimization): special cases for alpha = +-1, +-i, beta = +-1, +-i + + // alpha and beta are both nonzero + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, temp_real, temp_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + // Computed as: + // y.real = beta.real * y.real - beta.imag * y.imag + alpha.real * x.real - alpha.imag * conj(x.imag) + // y.imag = beta.real * y.imag + beta.imag * y.real + alpha.imag * x.real + alpha.real * conj(x.imag) + temp_real = VFMUL_VF(PREC, LMUL) (yvec_real, beta->real, vl); + temp_imag = VFMUL_VF(PREC, LMUL) (yvec_imag, beta->real, vl); + temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, beta->imag, yvec_imag, vl); + temp_imag = VFMACC_VF(PREC, LMUL) (temp_imag, beta->imag, yvec_real, vl); + yvec_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->real, xvec_real, vl); + yvec_imag = VFMACC_VF(PREC, LMUL) (temp_imag, alpha->imag, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE) { + yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl); + } else { + yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); + } + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + 
avl -= vl; + } + +} + +#endif // AXPBYV diff --git a/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..33eafc5d12 --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpbyv_sifive_x280_intr/bli_axpbyv_sifive_x280_intr_real.c @@ -0,0 +1,98 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPBYV + +AXPBYV(PRECISION_CHAR, void) +{ + // Computes y := beta * y + alpha * conjx(x) + // == beta * y + alpha * x (real case) + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict beta = beta_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + if (*alpha == 0 && *beta == 0){ + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx); + return; + } + if (*alpha == 0){ + SCALV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, beta, y, incy, cntx); + return; + } + if (*beta == 0){ + SCAL2V(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, y, incy, cntx); + return; + } + + // Note: in the cases alpha = 0 && beta = 1, or alpha = 1 && beta = 0, we + // will canonicalize NaNs whereas the reference code will propagate NaN payloads. 
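+    // (A clarifying note, not from the original patch: RISC-V floating-point
+    // arithmetic returns the canonical NaN rather than propagating the payload
+    // of a NaN operand, so the setv/scalv/scal2v shortcuts above may yield
+    // canonical NaNs where the reference kernel would keep the original payload.)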
+ + // TO DO (optimization): special cases for alpha = +-1, beta = +-1 + + // alpha and beta are both nonzero + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFMUL_VF(PREC, LMUL) (yvec, *beta, vl); + yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL)(y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } +} + +#endif // AXPYBV diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c new file mode 100644 index 0000000000..3f9ebd3b04 --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr.c @@ -0,0 +1,119 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define AXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpyv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict alpha_, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define AXPYV(...) 
AXPYV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef AXPYV +#undef AXPYV_ diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..dc520d2125 --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_complex.c @@ -0,0 +1,94 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef AXPYV + +AXPYV(PRECISION_CHAR, void) +{ + // Computes y := y + alpha * conjx(x) + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + if (alpha->real == 0 && alpha->imag == 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + yvec_real = VFMACC_VF(PREC, LMUL)( yvec_real, alpha->real, xvec_real, vl); + yvec_imag = VFMACC_VF(PREC, LMUL)( yvec_imag, alpha->imag, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE){ + yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFMACC_VF(PREC, LMUL) (yvec_imag, alpha->real, xvec_imag, vl); + } else { + yvec_real = VFMACC_VF(PREC, LMUL) (yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); + } + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + +} + +#endif // AXPYV diff --git a/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..0c2cda842f --- /dev/null +++ b/kernels/sifive_x280/1/bli_axpyv_sifive_x280_intr/bli_axpyv_sifive_x280_intr_real.c @@ -0,0 +1,79 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPYV + +AXPYV(PRECISION_CHAR, void) +{ + // Computes y = y + alpha * conj(x) + // == y + alpha * x (real case) + + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + if (*alpha == 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFMACC_VF(PREC, LMUL)(yvec, *alpha, xvec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL) (y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } +} + +#endif // AXPYV diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c new file mode 100644 index 0000000000..3571877759 --- /dev/null +++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c @@ -0,0 +1,272 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)conjx; + (void)cntx; + const float* restrict x = x_; + float* restrict y = y_; + if (n <= 0) + return; + + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + + if (incy == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)conjx; + (void)cntx; + const double* restrict x = x_; + double* restrict y = y_; + if (n <= 0) + return; + + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + + if (incy == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " + +void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)cntx; + const scomplex* restrict x = x_; + scomplex* restrict y = y_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + if (conjx == BLIS_NO_CONJUGATE) { + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * 2 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + + if (incy == 2 * FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + } else { + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + else + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), 
"r"(incx)); + + __asm__("vfneg.v v4, v4"); + + if (incy == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 + +#define FLT_SIZE 8 +#define SH_ADD "sh3add " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " + +void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)cntx; + const dcomplex* restrict x = x_; + dcomplex* restrict y = y_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + if (conjx == BLIS_NO_CONJUGATE && incx == 2 * FLT_SIZE && + incy == 2 * FLT_SIZE) { + size_t avl = 2 * n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + __asm__(VLE "v0, (%0)" : : "r"(x)); + __asm__(VSE "v0, (%0)" : : "r"(y)); + __asm__(SH_ADD "%0, %1, %0" : "+r"(x) : "r"(vl)); + __asm__(SH_ADD "%0, %1, %0" : "+r"(y) : "r"(vl)); + avl -= vl; + } + } else { + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + else + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + + if (conjx == BLIS_CONJUGATE) + __asm__("vfneg.v v4, v4"); + + if (incy == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + } + return; +} diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c new file mode 100644 index 0000000000..0dc8565400 --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr.c @@ -0,0 +1,120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define DOTV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotv_sifive_x280_intr(\ + conj_t conjxt, \ + conj_t conjy, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + const T* restrict y_, inc_t incy, \ + T* restrict rho_, \ + const cntx_t* cntx \ +) + +#define DOTV(...) DOTV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef DOTV +#undef DOTV_ diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..250fab46e6 --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_complex.c @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTV + +DOTV(PRECISION_CHAR, void) +{ + // Computes rho = conjxt(x)^T * conjy(y) + (void) cntx; + DATATYPE* restrict rho = rho_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + + if (n <= 0) { + rho->real = 0; + rho->imag = 0; + return; + } + + // Instead of conjugating x, switch conjugation on y + // and conjugate rho at the end + conj_t conjrho = conjxt; + if (conjxt == BLIS_CONJUGATE) + bli_toggle_conj(&conjy); // Switch conjugation of y + + RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag; + size_t avl = n; + bool first = true; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + if (first) { + acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl); + acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl); + first = false; + } else { + acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl); + acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl); + } + if (conjy == BLIS_NO_CONJUGATE) { + acc_real = VFNMSAC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl); + acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl); + } else { + acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl); + acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl); + } + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + + + RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1); + RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1); + sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); + sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n); + + if (conjrho == BLIS_CONJUGATE) { + sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1); + } + rho->real = VFMV_F_S(PREC)(sum_real); + rho->imag = VFMV_F_S(PREC)(sum_imag); + +} + +#endif // DOTV diff --git a/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..0ec8e6328a --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotv_sifive_x280_intr/bli_dotv_sifive_x280_intr_real.c @@ -0,0 +1,87 @@ 
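The complex dotv kernel above relies on the identity conj(x)^T y = conj(x^T conj(y)): rather than conjugating x elementwise, it toggles the conjugation onto y and conjugates the reduced sum once at the end, so the vector loop only selects between two FMA sign patterns and a single scalar conjugation happens after the reduction. The standalone C sketch below is illustrative only and not part of the patch; it uses standard <complex.h>, and the function names are hypothetical. It checks the identity for the representative case conjxt = BLIS_CONJUGATE, conjy = BLIS_NO_CONJUGATE.

#include <complex.h>
#include <stdio.h>

// Direct form: rho = sum_i conj(x[i]) * y[i]
static double complex dot_conjx_direct(int n, const double complex *x,
                                        const double complex *y)
{
    double complex rho = 0.0;
    for (int i = 0; i < n; i++)
        rho += conj(x[i]) * y[i];
    return rho;
}

// Toggled form used by the kernel: accumulate x[i] * conj(y[i]),
// then conjugate the reduced sum once at the end.
static double complex dot_conjx_toggled(int n, const double complex *x,
                                        const double complex *y)
{
    double complex acc = 0.0;
    for (int i = 0; i < n; i++)
        acc += x[i] * conj(y[i]);
    return conj(acc);
}

int main(void)
{
    double complex x[3] = { 1.0 + 2.0*I, -0.5 + 0.25*I, 3.0 - 1.0*I };
    double complex y[3] = { 0.5 - 1.0*I,  2.0 + 1.5*I, -1.0 + 4.0*I };

    double complex a = dot_conjx_direct(3, x, y);
    double complex b = dot_conjx_toggled(3, x, y);

    // The two results agree (up to rounding); build e.g. with: cc -std=c99 demo.c -lm
    printf("direct : % .6f %+.6fi\n", creal(a), cimag(a));
    printf("toggled: % .6f %+.6fi\n", creal(b), cimag(b));
    return 0;
}

The dotxv kernel further below applies the same toggle, then scales the reduced dot product by alpha before accumulating it into rho.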
+/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTV + +DOTV(PRECISION_CHAR, void) +{ + // Computes rho = conjxt(x)^T * conjy(y) + // == x^T * y (real case) + (void) cntx; + (void) conjxt; // Suppress unused parameter warnings + (void) conjy; + DATATYPE* restrict rho = rho_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + + if (n <= 0) { + *rho = 0; + return; + } + + RVV_TYPE_F(PREC, LMUL) acc; + size_t avl = n; + bool first = true; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (first) { + acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl); + first = false; + } else + acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + + RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1); + sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n); + *rho = VFMV_F_S(PREC)(sum); +} + +#endif // DOTV diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c new file mode 100644 index 0000000000..048f8d2983 --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr.c @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define DOTXV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotxv_sifive_x280_intr(\ + conj_t conjxt, \ + conj_t conjy, \ + dim_t n, \ + const T* restrict alpha_, \ + const T* restrict x_, inc_t incx, \ + const T* restrict y_, inc_t incy, \ + const T* restrict beta_, \ + T* restrict rho_, \ + const cntx_t* cntx \ +) + +#define DOTXV(...) DOTXV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) +#define FMA fmaf + +#include "./bli_dotxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef FMA + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) +#define FMA fma + +#include "./bli_dotxv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef FMA + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) +#define FMA fmaf + +#include "./bli_dotxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef FMA + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) +#define FMA fma + +#include "./bli_dotxv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE +#undef FMA + +#undef DOTXV +#undef DOTXV_ diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..8245e8e057 --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_complex.c @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTXV + +DOTXV(PRECISION_CHAR, void) +{ + // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y) + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict rho = rho_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + + if (beta->real == 0 && beta->imag == 0){ + rho->real = 0; + rho->imag = 0; + } else if (!(beta->real == 1 && beta->imag == 0)) { + DATATYPE temp = *rho; + rho->real = rho->real * beta->real - rho->imag * beta->imag; + rho->imag = temp.real * beta->imag + rho->imag * beta->real; + } + + if (n <= 0 || (alpha->real == 0 && alpha->imag == 0)) + return; + + // Instead of conjugating x, switch conjugation on y + // and conjugate dot product at the end + conj_t conjsum = conjxt; + if (conjxt == BLIS_CONJUGATE) + bli_toggle_conj(&conjy); // Switch conjugation of y + + // Compute dot product + RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag; + size_t avl = n; + bool first = true; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + if (first) { + acc_real = VFMUL_VV(PREC, LMUL)(xvec_real, yvec_real, vl); + acc_imag = VFMUL_VV(PREC, LMUL)(xvec_imag, yvec_real, vl); + first = false; + } else { + acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_real, yvec_real, vl); + acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_imag, yvec_real, vl); + } + if (conjy == BLIS_NO_CONJUGATE) { + acc_real = VFNMSAC_VV_TU(PREC, 
LMUL)(acc_real, xvec_imag, yvec_imag, vl); + acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_real, yvec_imag, vl); + } else { + acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_imag, yvec_imag, vl); + acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl); + } + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + + + RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)(0.f, 1); + RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)(0.f, 1); + sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); + sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n); + + if (conjsum == BLIS_CONJUGATE) { + sum_imag = VFNEG_VF(PREC, m1)(sum_imag, 1); + } + DATATYPE dot = {VFMV_F_S(PREC)(sum_real), VFMV_F_S(PREC)(sum_imag)}; + + // Accumulate alpha * dot + rho->real = fma( alpha->real, dot.real, rho->real); + rho->real = fma(-alpha->imag, dot.imag, rho->real); + rho->imag = fma( alpha->imag, dot.real, rho->imag); + rho->imag = fma( alpha->real, dot.imag, rho->imag); + +} + +#endif // DOTXV diff --git a/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..f9d9346973 --- /dev/null +++ b/kernels/sifive_x280/1/bli_dotxv_sifive_x280_intr/bli_dotxv_sifive_x280_intr_real.c @@ -0,0 +1,94 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef DOTXV + +DOTXV(PRECISION_CHAR, void) +{ + // Computes rho = beta * rho + alpha * conjxt(x)^T * conjy(y) + // == beta * rho + alpha * x^T * y (real case) + + (void) cntx; + (void) conjxt; // Suppress unused parameter warnings + (void) conjy; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict beta = beta_; + DATATYPE* restrict rho = rho_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + + if (*beta == 0) + *rho = 0; + else if (*beta != 1.0f) + *rho *= *beta; + + if (n <= 0 || *alpha == 0) + return; + + // Compute dot product + RVV_TYPE_F(PREC, LMUL) acc; + size_t avl = n; + bool first = true; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (first) { + acc = VFMUL_VV(PREC, LMUL)(xvec, yvec, vl); + first = false; + } else + acc = VFMACC_VV_TU(PREC, LMUL)(acc, xvec, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } + + RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)(0.f, 1); + sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n); + *rho = fma(*alpha, VFMV_F_S(PREC)(sum), *rho); +} + +#endif // DOTXV diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c new file mode 100644 index 0000000000..cbca885929 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c @@ -0,0 +1,221 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)cntx; + float* restrict x = x_; + if (n <= 0) + return; + + float one = 1.f; + __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); + incx *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(x)); + __asm__("vfrdiv.vf v0, v0, f0"); + __asm__(VSE "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfrdiv.vf v0, v0, f0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)cntx; + double* restrict x = x_; + if (n <= 0) + return; + + double one = 1.; + __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); + incx *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(x)); + __asm__("vfrdiv.vf v0, v0, f0"); + __asm__(VSE "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfrdiv.vf v0, v0, f0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " + +void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)cntx; + scomplex* restrict x = x_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + __asm__("vfneg.v v4, v4"); + __asm__("vfmul.vv v8, v0, v0"); + __asm__("vfmacc.vv v8, v4, v4"); + __asm__("vfdiv.vv v0, v0, v8"); + __asm__("vfdiv.vv v4, v4, v8"); + __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfneg.v v4, v4"); + __asm__("vfmul.vv v8, v0, v0"); + __asm__("vfmacc.vv v8, v4, v4"); + __asm__("vfdiv.vv v0, v0, v8"); + __asm__("vfdiv.vv v4, v4, v8"); + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 + +#define FLT_SIZE 8 +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " + +void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t 
incx, + const cntx_t *cntx) { + (void)cntx; + dcomplex* restrict x = x_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + __asm__("vfneg.v v4, v4"); + __asm__("vfmul.vv v8, v0, v0"); + __asm__("vfmacc.vv v8, v4, v4"); + __asm__("vfdiv.vv v0, v0, v8"); + __asm__("vfdiv.vv v4, v4, v8"); + __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfneg.v v4, v4"); + __asm__("vfmul.vv v8, v0, v0"); + __asm__("vfmacc.vv v8, v4, v4"); + __asm__("vfdiv.vv v0, v0, v8"); + __asm__("vfdiv.vv v4, v4, v8"); + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c new file mode 100644 index 0000000000..51edc92214 --- /dev/null +++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c @@ -0,0 +1,266 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FDIV "fdiv.s " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)conjalpha; + (void)cntx; + const float* restrict alpha = alpha_; + float* restrict x = x_; + if (n <= 0 || *alpha == 0.f || *alpha == 1.f) + return; + + float one = 1.f; + __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); + __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); + __asm__(FDIV "f0, f0, f1"); + incx *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(x)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__(VSE "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FDIV +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FDIV "fdiv.d " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)conjalpha; + (void)cntx; + const double* restrict alpha = alpha_; + double* restrict x = x_; + if (n <= 0 || *alpha == 0. || *alpha == 1.) 
+ return; + + double one = 1.; + __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one)); + __asm__(FLT_LOAD "f1, (%0)" : : "r"(alpha)); + __asm__(FDIV "f0, f0, f1"); + incx *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(x)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__(VSE "v0, (%0)" : : "r"(x)); + } else { + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FDIV +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FMUL "fmul.s " +#define FMADD "fmadd.s " +#define FDIV "fdiv.s " +#define FNEG "fneg.s " +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " + +void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)cntx; + const scomplex* restrict alpha = alpha_; + scomplex* restrict x = x_; + if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f)) + return; + + __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FMUL "f2, f0, f0"); + __asm__(FMADD "f2, f1, f1, f2"); + __asm__(FDIV "f0, f0, f2"); + __asm__(FDIV "f1, f1, f2"); + if (conjalpha == BLIS_NO_CONJUGATE) + __asm__(FNEG "f1, f1"); + incx *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + __asm__("vfmul.vf v8, v0, f0"); + __asm__("vfmul.vf v12, v4, f0"); + __asm__("vfnmsac.vf v8, f1, v4"); + __asm__("vfmacc.vf v12, f1, v0"); + __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); + } else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfmul.vf v8, v0, f0"); + __asm__("vfmul.vf v12, v4, f0"); + __asm__("vfnmsac.vf v8, f1, v4"); + __asm__("vfmacc.vf v12, f1, v0"); + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef FMADD +#undef FDIV +#undef FNEG +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FMUL "fmul.d " +#define FMADD "fmadd.d " +#define FDIV "fdiv.d " +#define FNEG "fneg.d " +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " + +void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, + const cntx_t *cntx) { + (void)cntx; + const dcomplex* restrict alpha = alpha_; + dcomplex* restrict x = x_; + if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. 
&& alpha->imag == 0.)) + return; + + __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FMUL "f2, f0, f0"); + __asm__(FMADD "f2, f1, f1, f2"); + __asm__(FDIV "f0, f0, f2"); + __asm__(FDIV "f1, f1, f2"); + if (conjalpha == BLIS_NO_CONJUGATE) + __asm__(FNEG "f1, f1"); + incx *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + __asm__("vfmul.vf v8, v0, f0"); + __asm__("vfmul.vf v12, v4, f0"); + __asm__("vfnmsac.vf v8, f1, v4"); + __asm__("vfmacc.vf v12, f1, v0"); + __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); + } else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("vfmul.vf v8, v0, f0"); + __asm__("vfmul.vf v12, v4, f0"); + __asm__("vfnmsac.vf v8, f1, v4"); + __asm__("vfmacc.vf v12, f1, v0"); + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); + } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c new file mode 100644 index 0000000000..cd2dd2c188 --- /dev/null +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr.c @@ -0,0 +1,124 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define SCAL2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scal2v_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict alpha_, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define SCAL2V(...) 
SCAL2V_(__VA_ARGS__) + +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_scal2v_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_scal2v_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_scal2v_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_scal2v_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SCAL2V +#undef SCAL2V_ diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..4a25ce3e32 --- /dev/null +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_complex.c @@ -0,0 +1,100 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SCAL2V + +SCAL2V(PRECISION_CHAR, void) +{ + // Computes y = alpha * conjx(x) + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + if (alpha->real == 0 && alpha->imag == 0) { + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx); + return; + } + + if (alpha->real == 1 && alpha->imag == 0) { + COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx); + return; + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + yvec_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl); + yvec_imag = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->imag, vl); + if (conjx == BLIS_NO_CONJUGATE) { + yvec_real = VFNMSAC_VF(PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFMACC_VF( PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); + } else { + yvec_real = VFMACC_VF( PREC, LMUL)(yvec_real, alpha->imag, xvec_imag, vl); + yvec_imag = VFNMSAC_VF(PREC, LMUL)(yvec_imag, alpha->real, xvec_imag, vl); + } + + // FIXME: remove the #pragmas and change the __riscv_vset_v_f intrinsics to use + // __riscv_vcreate_v_f once they become available in LLVM. + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wuninitialized" + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + #pragma GCC diagnostic pop + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + +} + +#endif // SCAL2V diff --git a/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c new file mode 100644 index 0000000000..7084e15cf5 --- /dev/null +++ b/kernels/sifive_x280/1/bli_scal2v_sifive_x280_intr/bli_scal2v_sifive_x280_intr_real.c @@ -0,0 +1,82 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SCAL2V + +SCAL2V(PRECISION_CHAR, void) +{ + // Computes y = alpha * conjx(x) + // == alpha * x (real case) + + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + if (*alpha == 0) { + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, y, incy, cntx); + return; + } + + if (*alpha == 1) { + COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx); + return; + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL) (y, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, xvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } +} + +#endif // SCAL2V diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c new file mode 100644 index 0000000000..b5788d632d --- /dev/null +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr.c @@ -0,0 +1,120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + +#define SCALV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##scalv_sifive_x280_intr(\ + conj_t conjalpha, \ + dim_t n, \ + const T* restrict alpha_, \ + T* restrict x_, inc_t incx, \ + const cntx_t* cntx \ +) + +#define SCALV(...) SCALV_(__VA_ARGS__) + +#define SETV_(PRECISION_CHAR) bli_##PRECISION_CHAR##setv_sifive_x280_asm +#define SETV(PRECISION_CHAR) SETV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_scalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_scalv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_scalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_scalv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SCALV +#undef SCALV_ diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..c6803c9676 --- /dev/null +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_complex.c @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SCALV + +SCALV(PRECISION_CHAR, void) +{ + // Computes x = conjalpha(alpha) * x + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0 || (alpha->real == 1 && alpha->imag == 0)) return; + + if (alpha->real == 0 && alpha->imag==0){ + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx); + return; + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + + RVV_TYPE_F(PREC, LMUL) temp_real = VFMUL_VF(PREC, LMUL)(xvec_real, alpha->real, vl); + RVV_TYPE_F(PREC, LMUL) temp_imag = VFMUL_VF(PREC, LMUL)(xvec_imag, alpha->real, vl); + if (conjalpha == BLIS_NO_CONJUGATE) { + temp_real = VFNMSAC_VF(PREC, LMUL)(temp_real, alpha->imag, xvec_imag, vl); + temp_imag = VFMACC_VF(PREC, LMUL)( temp_imag, alpha->imag, xvec_real, vl); + } else { + temp_real = VFMACC_VF(PREC, LMUL) (temp_real, alpha->imag, xvec_imag, vl); + temp_imag = VFNMSAC_VF(PREC, LMUL)(temp_imag, alpha->imag, xvec_real, vl); + } + + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, temp_real); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, temp_imag); + + if (incx == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, xvec, vl); + + x += vl*incx; + avl -= vl; + } + +} + +#endif // SCALV diff --git a/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..2b4e31d359 --- /dev/null +++ b/kernels/sifive_x280/1/bli_scalv_sifive_x280_intr/bli_scalv_sifive_x280_intr_real.c @@ -0,0 +1,76 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef SCALV + +SCALV(PRECISION_CHAR, void) +{ + // Computes x = conjalpha(alpha) * x + // == alpha * x (real case) + + (void) conjalpha; // Suppress unused parameter warnings + const DATATYPE* restrict alpha = alpha_; + DATATYPE* restrict x = x_; + + if (n <= 0 || *alpha == 1) return; + + if (*alpha == 0){ + SETV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, alpha, x, incx, cntx); + return; + } + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + xvec = VFMUL_VF(PREC, LMUL)(xvec, *alpha, vl); + + if (incx == 1) + VSE_V_F(PREC, LMUL) (x, xvec, vl); + else + VSSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, xvec, vl); + + x += vl * incx; + avl -= vl; + } +} + +#endif // SCALV diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c new file mode 100644 index 0000000000..ef9091f16c --- /dev/null +++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c @@ -0,0 +1,204 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, const cntx_t *cntx) { + (void)conjalpha; + (void)cntx; + const float* restrict alpha = alpha_; + float* restrict x = x_; + if (n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" + : + : "r"(n), "i"(8 * FLT_SIZE)); + __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); + incx *= FLT_SIZE; + + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(x)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, const cntx_t *cntx) { + (void)conjalpha; + (void)cntx; + const double* restrict alpha = alpha_; + double* restrict x = x_; + if (n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma" + : + : "r"(n), "i"(8 * FLT_SIZE)); + __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); + incx *= FLT_SIZE; + + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(x)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define VLSE "vlse32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " + +void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, const cntx_t *cntx) { + (void)cntx; + const scomplex* restrict alpha = alpha_; + scomplex* restrict x = x_; + if (n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" + : + : "r"(n), "i"(8 * FLT_SIZE)); + __asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); + __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(VLSE "v4, (t0), zero"); + if (conjalpha == BLIS_CONJUGATE) + __asm__("vfneg.v v4, v4"); + incx *= 2 * FLT_SIZE; + + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLSE +#undef VSSEG2 +#undef VSSSEG2 + +#define FLT_SIZE 8 +#define VLSE "vlse64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " + +void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict alpha_, + void * restrict x_, inc_t incx, const cntx_t *cntx) { + (void)cntx; + const dcomplex* restrict alpha = alpha_; + dcomplex* restrict x = x_; + if (n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" + : + : "r"(n), "i"(8 * FLT_SIZE)); + 
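// Broadcast alpha into vector registers: the zero-stride loads below replicate + // alpha->real into v0 and alpha->imag into v4 (v4 is negated when conjalpha == + // BLIS_CONJUGATE), so the segment stores in the loop write the same value of + // conjalpha(alpha) to every element of x. +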
__asm__(VLSE "v0, (%0), zero" : : "r"(alpha)); + __asm__("addi t0, %0, %1" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(VLSE "v4, (t0), zero"); + if (conjalpha == BLIS_CONJUGATE) + __asm__("vfneg.v v4, v4"); + incx *= 2 * FLT_SIZE; + + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(x)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + avl -= vl; + } + return; +} diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c new file mode 100644 index 0000000000..e6b483a3f8 --- /dev/null +++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr.c @@ -0,0 +1,118 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define SUBV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##subv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + T* restrict y_, inc_t incy, \ + const cntx_t* cntx \ +) + +#define SUBV(...) 
SUBV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_subv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_subv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_subv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_subv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef SUBV +#undef SUBV_ diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..2d4a1a017f --- /dev/null +++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_complex.c @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SUBV + +SUBV(PRECISION_CHAR, void) +{ + // Computes y := y - conjx(x) + (void) cntx; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + yvec_real = VFSUB_VV(PREC, LMUL)(yvec_real, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE) + yvec_imag = VFSUB_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + else + yvec_imag = VFADD_VV(PREC, LMUL)(yvec_imag, xvec_imag, vl); + + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 0, yvec_real); + yvec = VSET_V_F(PREC, LMUL, 2)(yvec, 1, yvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, yvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, yvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } + +} + +#endif // SUBV diff --git a/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..b158594319 --- /dev/null +++ b/kernels/sifive_x280/1/bli_subv_sifive_x280_intr/bli_subv_sifive_x280_intr_real.c @@ -0,0 +1,77 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef SUBV + +SUBV(PRECISION_CHAR, void) +{ + // Computes y = y - conjx(x) + // == y - x (real case) + (void) cntx; + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFSUB_VV(PREC, LMUL)(yvec, xvec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL) (y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } +} + +#endif // SUBV diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c new file mode 100644 index 0000000000..2342e254a2 --- /dev/null +++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c @@ -0,0 +1,245 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void * restrict y_, + inc_t incy, const cntx_t *cntx) { + (void)cntx; + float* restrict x = x_; + float* restrict y = y_; + if (n <= 0) + return; + + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == FLT_SIZE) + __asm__(VLE "v8, (%0)" : : "r"(y)); + else + __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); + + if (incx == FLT_SIZE) + __asm__(VSE "v8, (%0)" : : "r"(x)); + else + __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)cntx; + double* restrict x = x_; + double* restrict y = y_; + if (n <= 0) + return; + + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == FLT_SIZE) + __asm__(VLE "v8, (%0)" : : "r"(y)); + else + __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); + + if (incx == FLT_SIZE) + __asm__(VSE "v8, (%0)" : : "r"(x)); + else + __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)cntx; + scomplex* restrict x = x_; + scomplex* restrict y = y_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * 2 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(x)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == 2 * FLT_SIZE) + __asm__(VLE "v8, (%0)" : : "r"(y)); + else + __asm__(VLSE "v8, (%0), %1" : : "r"(y), "r"(incy)); + + if (incx == 2 * FLT_SIZE) + __asm__(VSE "v8, (%0)" : : "r"(x)); + else + __asm__(VSSE "v8, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == 2 * FLT_SIZE) + __asm__(VSE "v0, (%0)" : : "r"(y)); + 
else + __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " + +void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, + void * restrict y_, inc_t incy, const cntx_t *cntx) { + (void)cntx; + dcomplex* restrict x = x_; + dcomplex* restrict y = y_; + if (n <= 0) + return; + + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v0, (%0)" : : "r"(x)); + else + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == 2 * FLT_SIZE) + __asm__(VLSEG2 "v8, (%0)" : : "r"(y)); + else + __asm__(VLSSEG2 "v8, (%0), %1" : : "r"(y), "r"(incy)); + + if (incx == 2 * FLT_SIZE) + __asm__(VSSEG2 "v8, (%0)" : : "r"(x)); + else + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx)); + if (incy == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(y)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c new file mode 100644 index 0000000000..dce4085bff --- /dev/null +++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr.c @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define XPBYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##xpbyv_sifive_x280_intr(\ + conj_t conjx, \ + dim_t n, \ + const T* restrict x_, inc_t incx, \ + const T* restrict beta_, \ + T* restrict y_, inc_t incy, \ + const cntx_t* restrict cntx \ +) + +#define XPBYV(...) XPBYV_(__VA_ARGS__) + +#define COPYV_(PRECISION_CHAR) bli_##PRECISION_CHAR##copyv_sifive_x280_asm +#define COPYV(PRECISION_CHAR) COPYV_(PRECISION_CHAR) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_xpbyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_xpbyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_xpbyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_xpbyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef XPBYV +#undef XPBYV_ diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..4c86e8b36a --- /dev/null +++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_complex.c @@ -0,0 +1,101 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef XPBYV + +XPBYV(PRECISION_CHAR, void) +{ + // Computes y = beta * y + conjx(x) + const DATATYPE* restrict beta = beta_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + if (beta->real == 0 && beta->imag == 0){ + COPYV(PRECISION_CHAR)(conjx, n, x, incx, y, incy, cntx); + return; + } + + // TO DO (optimization): beta = +-1, +-i special cases + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + + // xpbyv is computed with FMAs as follows: + // y[i].real = ( x[i].real + beta.real * y[i].real) - beta.imag * y[i].imag + // y[i].imag = (conjx(x[i]).imag + beta.imag * y[i].real) + beta.real * y[i].imag + + xvec_real = VFMACC_VF( PREC, LMUL)(xvec_real, beta->real, yvec_real, vl); + xvec_real = VFNMSAC_VF(PREC, LMUL)(xvec_real, beta->imag, yvec_imag, vl); + if (conjx == BLIS_NO_CONJUGATE) + xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl); + else + xvec_imag = VFMSAC_VF(PREC, LMUL)(xvec_imag, beta->imag, yvec_real, vl); + xvec_imag = VFMACC_VF(PREC, LMUL)(xvec_imag, beta->real, yvec_imag, vl); + + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 0, xvec_real); + xvec = VSET_V_F(PREC, LMUL, 2)(xvec, 1, xvec_imag); + + if (incy == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, xvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, xvec, vl); + + x += vl*incx; + y += vl*incy; + avl -= vl; + } +} + +#endif // XPBYV diff --git a/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..b23272fea4 --- /dev/null +++ b/kernels/sifive_x280/1/bli_xpbyv_sifive_x280_intr/bli_xpbyv_sifive_x280_intr_real.c @@ -0,0 +1,84 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef XPBYV + +XPBYV(PRECISION_CHAR, void) +{ + // Computes y = beta * y + conjx(x) + // == beta * y + x (real case) + (void) conjx; // Suppress unused parameter warnings + const DATATYPE* restrict beta = beta_; + const DATATYPE* restrict x = x_; + DATATYPE* restrict y = y_; + + if (n <= 0) return; + + if (*beta == 0){ + COPYV(PRECISION_CHAR)(BLIS_NO_CONJUGATE, n, x, incx, y, incy, cntx); + return; + } + + // TO DO (optimization): beta = +-1 special cases + + size_t avl = n; + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL) (x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL) (y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + yvec = VFMADD_VF(PREC, LMUL)(yvec, *beta, xvec, vl); + + if (incy == 1) + VSE_V_F(PREC, LMUL) (y, yvec, vl); + else + VSSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, yvec, vl); + + x += vl * incx; + y += vl * incy; + avl -= vl; + } +} + +#endif // XPBYV diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c new file mode 100644 index 0000000000..1b5ce3b962 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr.c @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + + +#define AXPY2V_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##axpy2v_sifive_x280_intr(\ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const T* restrict alphax_, \ + const T* restrict alphay_, \ + const T* restrict x_, inc_t incx, \ + const T* restrict y_, inc_t incy, \ + T* restrict z_, inc_t incz, \ + const cntx_t* restrict cntx \ +) + +#define AXPY2V(...) AXPY2V_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpy2v_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpy2v_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_axpy2v_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_axpy2v_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef AXPY2V +#undef AXPY2V_ diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..9b57198272 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_complex.c @@ -0,0 +1,117 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef AXPY2V + +AXPY2V(PRECISION_CHAR, void) +{ + // Computes z := z + alphax * conjx(x) + alphay * conjy(y) + const DATATYPE* restrict alphax = alphax_; + const DATATYPE* restrict alphay = alphay_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (n <= 0) + return; + + size_t avl = n; + + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag; + + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + + // + alphax * conjx(x) + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->real, xvec_real, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->imag, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE){ + zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphax->imag, xvec_imag, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphax->real, xvec_imag, vl); + } else { + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphax->imag, xvec_imag, vl); + zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphax->real, xvec_imag, vl); + } + + // + alphay * conjy(y) + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->real, yvec_real, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->imag, yvec_real, vl); + if (conjy == BLIS_NO_CONJUGATE){ + zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alphay->imag, yvec_imag, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alphay->real, yvec_imag, vl); + } else { + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alphay->imag, yvec_imag, vl); + zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alphay->real, yvec_imag, vl); + } + + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag); + + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 2)( 
(BASE_DT*) z, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl); + + x += vl*incx; + y += vl*incy; + z += vl*incz; + avl -= vl; + } + +} + +#endif // AXPY2V diff --git a/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c new file mode 100644 index 0000000000..cebb159973 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpy2v_sifive_x280_intr/bli_axpy2v_sifive_x280_intr_real.c @@ -0,0 +1,91 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef AXPY2V + +AXPY2V(PRECISION_CHAR, void) +{ + // Computes z := z + alphax * conjx(x) + alphay * conjy(y) + // == z + alphax * x + alphay * y (real case) + + (void) conjx; // Suppress unused parameter warnings + (void) conjy; + const DATATYPE* restrict alphax = alphax_; + const DATATYPE* restrict alphay = alphay_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + DATATYPE* restrict z = z_; + + if (n <= 0) + return; + + size_t avl = n; + + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec; + + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl); + + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphax, xvec, vl); + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alphay, yvec, vl); + + if (incz == 1) + VSE_V_F(PREC, LMUL)(z, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl); + + x += vl*incx; + y += vl*incy; + z += vl*incz; + avl -= vl; + } + +} + +#endif // AXPY2V diff --git a/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c new file mode 100644 index 0000000000..43c2ba44e2 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_axpyf_sifive_x280_asm.c @@ -0,0 +1,430 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_saxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, + const void *restrict alpha_, const void *restrict a_, inc_t inca, + inc_t lda, const void *restrict x_, inc_t incx, + void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { + (void)conja; + (void)conjx; + (void)cntx; + const float *restrict alpha = alpha_; + const float *restrict a = a_; + const float *restrict x = x_; + float *restrict y = y_; + + if (m == 0 || b == 0) + return; + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = m; + while (avl) { + // process vl elements of y at a time + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + // x_tmp traverses x + // a points to the vl x b block of a needed this iteration + // a_tmp traverses the columns of this block + const float* restrict x_tmp = x; + const float* restrict a_tmp = a; + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + if (inca == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + + for (dim_t i = 1; i < b; ++i) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + if (inca == FLT_SIZE) + __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmacc.vf v0, ft0, v24"); + } + + if (incy == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(y)); + __asm__("vfmacc.vf v24, ft11, v0"); + __asm__(VSE "v24, (%0)" : : "r"(y)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); + __asm__("vfmacc.vf v24, ft11, v0"); + __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); + } + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_daxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, + const void *restrict alpha_, const void *restrict a_, inc_t inca, + inc_t lda, const void *restrict x_, inc_t incx, + void *restrict y_, inc_t incy, const cntx_t *restrict cntx) { + (void)conja; + (void)conjx; + (void)cntx; + const double *restrict alpha = alpha_; + const double *restrict a = a_; + const double *restrict x = x_; + double *restrict y = y_; + + if (m == 0 || b == 0) + return; + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(alpha)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + size_t avl = m; + while (avl) { + // process vl elements of y at a time + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m8, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + // x_tmp traverses x + // a points to the vl x b block of a needed this iteration + // a_tmp traverses the columns of this block + const double* restrict x_tmp = x; + const double* 
restrict a_tmp = a; + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + if (inca == FLT_SIZE) + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSE "v0, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + + for (dim_t i = 1; i < b; ++i) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + if (inca == FLT_SIZE) + __asm__(VLE "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSE "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmacc.vf v0, ft0, v24"); + } + + if (incy == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(y)); + __asm__("vfmacc.vf v24, ft11, v0"); + __asm__(VSE "v24, (%0)" : : "r"(y)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(y), "r"(incy)); + __asm__("vfmacc.vf v24, ft11, v0"); + __asm__(VSSE "v24, (%0), %1" : : "r"(y), "r"(incy)); + } + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLSEG "vlseg2e32.v " +#define VLSSEG "vlsseg2e32.v " +#define VSSEG "vsseg2e32.v " +#define VSSSEG "vssseg2e32.v " + +void bli_caxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, + const void *restrict alpha_, const void *restrict a_, + inc_t inca, inc_t lda, const void *restrict x_, + inc_t incx, void *restrict y_, inc_t incy, + const cntx_t *restrict cntx) { + (void)cntx; + const scomplex *restrict alpha = alpha_; + const scomplex *restrict a = a_; + const scomplex *restrict x = x_; + scomplex *restrict y = y_; + + if (m == 0 || b == 0) + return; + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + size_t avl = m; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + const scomplex* restrict x_tmp = x; + const scomplex* restrict a_tmp = a; + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); + if (inca == 2 * FLT_SIZE) + __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmul.vf v0, v24, ft0"); + __asm__("vfmul.vf v4, v24, ft1"); + if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfmsac.vf v4, ft0, v28"); + } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfnmsac.vf v4, ft0, v28"); + } else { + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfnmacc.vf v4, ft0, v28"); + } + + for (dim_t i = 1; i < b; ++i) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); + if (inca == 2 * FLT_SIZE) + __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); 
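+ // At this point the segment load has split the current column of a into
+ // v24 (real parts) and v28 (imaginary parts), while ft0/ft1 hold the real
+ // and imaginary parts of the current element of x. The conjugation-dependent
+ // vfmacc/vfnmsac chain that follows accumulates the real part of
+ // conja(a)*conjx(x) into v0 and the imaginary part into v4.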
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmacc.vf v0, ft0, v24"); + if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v4, ft1, v24"); + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { + __asm__("vfnmsac.vf v4, ft1, v24"); + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v4, ft1, v24"); + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfnmsac.vf v4, ft0, v28"); + } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE + __asm__("vfnmsac.vf v4, ft1, v24"); + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfnmsac.vf v4, ft0, v28"); + } + } + + if (incy == 2 * FLT_SIZE) { + __asm__(VLSEG "v24, (%0)" : : "r"(y)); + __asm__("vfmacc.vf v24, ft10, v0"); + __asm__("vfmacc.vf v28, ft10, v4"); + __asm__("vfnmsac.vf v24, ft11, v4"); + __asm__("vfmacc.vf v28, ft11, v0"); + __asm__(VSSEG "v24, (%0)" : : "r"(y)); + } else { + __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); + __asm__("vfmacc.vf v24, ft10, v0"); + __asm__("vfmacc.vf v28, ft10, v4"); + __asm__("vfnmsac.vf v24, ft11, v4"); + __asm__("vfmacc.vf v28, ft11, v0"); + __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); + } + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLSEG +#undef VLSSEG +#undef VSSEG +#undef VSSSEG + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLSEG "vlseg2e64.v " +#define VLSSEG "vlsseg2e64.v " +#define VSSEG "vsseg2e64.v " +#define VSSSEG "vssseg2e64.v " + +void bli_zaxpyf_sifive_x280_asm(conj_t conja, conj_t conjx, dim_t m, dim_t b, + const void *restrict alpha_, const void *restrict a_, + inc_t inca, inc_t lda, const void *restrict x_, + inc_t incx, void *restrict y_, inc_t incy, + const cntx_t *restrict cntx) { + (void)cntx; + const dcomplex *restrict alpha = alpha_; + const dcomplex *restrict a = a_; + const dcomplex *restrict x = x_; + dcomplex *restrict y = y_; + + if (m == 0 || b == 0) + return; + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + size_t avl = m; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" + : "=r"(vl) + : "r"(avl), "i"(8 * FLT_SIZE)); + const dcomplex* restrict x_tmp = x; + const dcomplex* restrict a_tmp = a; + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); + if (inca == 2 * FLT_SIZE) + __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmul.vf v0, v24, ft0"); + __asm__("vfmul.vf v4, v24, ft1"); + if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfmsac.vf v4, ft0, v28"); + } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v0, ft1, v28"); + 
__asm__("vfnmsac.vf v4, ft0, v28"); + } else { + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfnmacc.vf v4, ft0, v28"); + } + + for (dim_t i = 1; i < b; ++i) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x_tmp)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x_tmp), "I"(FLT_SIZE)); + if (inca == 2 * FLT_SIZE) + __asm__(VLSEG "v24, (%0)" : : "r"(a_tmp)); + else + __asm__(VLSSEG "v24, (%0), %1" : : "r"(a_tmp), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_tmp) : "r"(lda)); + __asm__("vfmacc.vf v0, ft0, v24"); + if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v4, ft1, v24"); + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_NO_CONJUGATE && conjx == BLIS_CONJUGATE) { + __asm__("vfnmsac.vf v4, ft1, v24"); + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfmacc.vf v4, ft0, v28"); + } else if (conja == BLIS_CONJUGATE && conjx == BLIS_NO_CONJUGATE) { + __asm__("vfmacc.vf v4, ft1, v24"); + __asm__("vfmacc.vf v0, ft1, v28"); + __asm__("vfnmsac.vf v4, ft0, v28"); + } else { // conja == BLIS_CONJUGATE && conjx == BLIS_CONJUGATE + __asm__("vfnmsac.vf v4, ft1, v24"); + __asm__("vfnmsac.vf v0, ft1, v28"); + __asm__("vfnmsac.vf v4, ft0, v28"); + } + } + + if (incy == 2 * FLT_SIZE) { + __asm__(VLSEG "v24, (%0)" : : "r"(y)); + __asm__("vfmacc.vf v24, ft10, v0"); + __asm__("vfmacc.vf v28, ft10, v4"); + __asm__("vfnmsac.vf v24, ft11, v4"); + __asm__("vfmacc.vf v28, ft11, v0"); + __asm__(VSSEG "v24, (%0)" : : "r"(y)); + } else { + __asm__(VLSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); + __asm__("vfmacc.vf v24, ft10, v0"); + __asm__("vfmacc.vf v28, ft10, v4"); + __asm__("vfnmsac.vf v24, ft11, v4"); + __asm__("vfmacc.vf v28, ft11, v0"); + __asm__(VSSSEG "v24, (%0), %1" : : "r"(y), "r"(incy)); + } + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(vl * inca)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy)); + avl -= vl; + } + return; +} diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c new file mode 100644 index 0000000000..9cd1071d7a --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr.c @@ -0,0 +1,122 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off + +#include +#include +#include "blis.h" +#include "../../riscv_overloaded_intrinsics.h" + +#define DOTAXPYV_(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##dotaxpyv_sifive_x280_intr(\ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const T* restrict alpha_, \ + const T* restrict x_, inc_t incx, \ + const T* restrict y_, inc_t incy, \ + T* restrict rho_, \ + T* restrict z_, inc_t incz, \ + const cntx_t* restrict cntx \ +) + +#define DOTAXPYV(...) DOTAXPYV_(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PREC 32 +#define LMUL m8 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotaxpyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PREC 64 +#define LMUL m8 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotaxpyv_sifive_x280_intr_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define BASE_DT float +#define PRECISION_CHAR c +#define PREC 32 +#define LMUL m4 +#define FLT_SIZE sizeof(float) + +#include "./bli_dotaxpyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define BASE_DT double +#define PRECISION_CHAR z +#define PREC 64 +#define LMUL m4 +#define FLT_SIZE sizeof(double) + +#include "./bli_dotaxpyv_sifive_x280_intr_complex.c" + +#undef DATATYPE +#undef BASE_DT +#undef PRECISION_CHAR +#undef PREC +#undef LMUL +#undef FLT_SIZE + +#undef DOTAXPYV +#undef DOTAXPYV_ diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c new file mode 100644 index 0000000000..c3cd06c523 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_complex.c @@ -0,0 +1,151 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef DOTAXPYV + +DOTAXPYV(PRECISION_CHAR, void) +{ + // Computes z := z + alpha * conjx(x) + // and rho := conjxt(x)^T * conjy(y) + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + DATATYPE* restrict rho = rho_; + DATATYPE* restrict z = z_; + + if (n <= 0) + return; + + size_t avl = n; + bool first = true; + RVV_TYPE_F(PREC, LMUL) acc_real, acc_imag; + + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_FX(PREC, LMUL, 2) xvec, yvec, zvec; + RVV_TYPE_F(PREC, LMUL) xvec_real, xvec_imag, yvec_real, yvec_imag, zvec_real, zvec_imag; + + // Loads + if (incx == 1) + xvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) x, vl); + else + xvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) x, 2*FLT_SIZE*incx, vl); + + if (incy == 1) + yvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) y, vl); + else + yvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) y, 2*FLT_SIZE*incy, vl); + + if (incz == 1) + zvec = VLSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, vl); + else + zvec = VLSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, vl); + + xvec_real = VGET_V_F(PREC, LMUL, 2)(xvec, 0); + xvec_imag = VGET_V_F(PREC, LMUL, 2)(xvec, 1); + yvec_real = VGET_V_F(PREC, LMUL, 2)(yvec, 0); + yvec_imag = VGET_V_F(PREC, LMUL, 2)(yvec, 1); + zvec_real = VGET_V_F(PREC, LMUL, 2)(zvec, 0); + zvec_imag = VGET_V_F(PREC, LMUL, 2)(zvec, 1); + + // z := z + alpha * conjx(x) + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->real, xvec_real, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->imag, xvec_real, vl); + if (conjx == BLIS_NO_CONJUGATE){ + zvec_real = VFNMSAC_VF(PREC, LMUL)(zvec_real, alpha->imag, xvec_imag, vl); + zvec_imag = VFMACC_VF(PREC, LMUL)( zvec_imag, alpha->real, xvec_imag, vl); + } else { + zvec_real = VFMACC_VF(PREC, LMUL)( zvec_real, alpha->imag, xvec_imag, vl); + zvec_imag = VFNMSAC_VF(PREC, LMUL)(zvec_imag, alpha->real, xvec_imag, vl); + } + + // rho := conjxt(x)^T * conjy(y) + // We accumulate the current term of the dot product as (a*c-b*d) + (a*d+b*c)*i, + // conjugating when necessary + if (first) { + // Initialize real part: a*c + acc_real = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_real, vl); + // Initialize imaginary part: a*d + acc_imag = VFMUL_VV(PREC, LMUL)( xvec_real, yvec_imag, vl); + if (conjy == BLIS_CONJUGATE) + acc_imag = VFNEG_VF(PREC, LMUL)(acc_imag, vl); // TO DO: eliminate this negation + first = false; + } else { + // Accumulate real part: a*c + acc_real = VFMACC_VV_TU(PREC, LMUL)( acc_real, xvec_real, yvec_real, vl); + // Accumulate imaginary part: a*d + if (conjy == BLIS_NO_CONJUGATE) + acc_imag = VFMACC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, yvec_imag, vl); + else + acc_imag = VFNMSAC_VV_TU(PREC, LMUL)(acc_imag, xvec_real, 
yvec_imag, vl); + } + // Finish real part: b*d + if (conjxt == BLIS_NO_CONJUGATE ^ conjy == BLIS_NO_CONJUGATE) + // Exactly one is conjugated => add + acc_real = VFMACC_VV_TU(PREC, LMUL)(acc_real, xvec_imag, yvec_imag, vl); + else + acc_real = VFNMSAC_VV_TU(PREC,LMUL)(acc_real, xvec_imag, yvec_imag, vl); + // Finish imaginary part: b*c + if (conjxt == BLIS_NO_CONJUGATE) + acc_imag = VFMACC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl); + else + acc_imag = VFNMSAC_VV_TU(PREC, LMUL)( acc_imag, xvec_imag, yvec_real, vl); + + // Stores + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 0, zvec_real); + zvec = VSET_V_F(PREC, LMUL, 2)(zvec, 1, zvec_imag); + + if (incz == 1) + VSSEG2_V_F(PREC, LMUL, 2)( (BASE_DT*) z, zvec, vl); + else + VSSSEG2_V_F(PREC, LMUL, 2)((BASE_DT*) z, 2*FLT_SIZE*incz, zvec, vl); + + x += vl*incx; + y += vl*incy; + z += vl*incz; + avl -= vl; + } + + // Compute rho + RVV_TYPE_F(PREC, m1) sum_real = VFMV_S_F(PREC, m1)( 0.f, 1); + RVV_TYPE_F(PREC, m1) sum_imag = VFMV_S_F(PREC, m1)( 0.f, 1); + sum_real = VF_REDUSUM_VS(PREC, LMUL)(acc_real, sum_real, n); + sum_imag = VF_REDUSUM_VS(PREC, LMUL)(acc_imag, sum_imag, n); + rho->real = VFMV_F_S(PREC)(sum_real); + rho->imag = VFMV_F_S(PREC)(sum_imag); + +} + +#endif // ifdef DOTAXPYV diff --git a/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c new file mode 100644 index 0000000000..adaf3610b0 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotaxpyv_sifive_x280_intr/bli_dotaxpyv_sifive_x280_intr_real.c @@ -0,0 +1,111 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef DOTAXPYV + +DOTAXPYV(PRECISION_CHAR, void) +{ + // Computes z := z + alpha * conjx(x) + // == z + alphax * x (real case) + // and rho := conjxt(x)^T * conjy(y) + // == x^T * y (real case) + + (void) conjx; // Suppress unused parameter warnings + (void) conjxt; + (void) conjy; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict x = x_; + const DATATYPE* restrict y = y_; + DATATYPE* restrict rho = rho_; + DATATYPE* restrict z = z_; + + if (n <= 0) + return; + + size_t avl = n; + bool first = true; + RVV_TYPE_F(PREC, LMUL) acc; + + while (avl) { + size_t vl = VSETVL(PREC, LMUL)(avl); + RVV_TYPE_F(PREC, LMUL) xvec, yvec, zvec; + + // Loads + if (incx == 1) + xvec = VLE_V_F(PREC, LMUL)(x, vl); + else + xvec = VLSE_V_F(PREC, LMUL)(x, FLT_SIZE * incx, vl); + + if (incy == 1) + yvec = VLE_V_F(PREC, LMUL)(y, vl); + else + yvec = VLSE_V_F(PREC, LMUL)(y, FLT_SIZE * incy, vl); + + if (incz == 1) + zvec = VLE_V_F(PREC, LMUL)(z, vl); + else + zvec = VLSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, vl); + + // z := z + alphax * x + zvec = VFMACC_VF(PREC, LMUL)(zvec, *alpha, xvec, vl); + + // rho := x^T * y + if (first){ + acc = VFMUL_VV(PREC, LMUL)( xvec, yvec, vl); + first = false; + } else { + acc = VFMACC_VV_TU(PREC, LMUL)( acc, xvec, yvec, vl); + } + + // Store + if (incz == 1) + VSE_V_F(PREC, LMUL)(z, zvec, vl); + else + VSSE_V_F(PREC, LMUL)(z, FLT_SIZE * incz, zvec, vl); + + x += vl*incx; + y += vl*incy; + z += vl*incz; + avl -= vl; + } + + // Compute rho + RVV_TYPE_F(PREC, m1) sum = VFMV_S_F(PREC, m1)( 0.f, 1); + sum = VF_REDUSUM_VS(PREC, LMUL)(acc, sum, n); + *rho = VFMV_F_S(PREC)(sum); + +} + +#endif // ifdef DOTAXPYV diff --git a/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c new file mode 100644 index 0000000000..ecb340707b --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxaxpyf_sifive_x280_asm.c @@ -0,0 +1,3120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "../riscv_cmul_macros_asm.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FMUL "fmul.s " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_sdotxaxpyf_sifive_x280_asm( + conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict w_, inc_t incw, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + void* restrict z_, inc_t incz, + const cntx_t* restrict cntx + ) { + (void)conjat; + (void)conja; + (void)conjw; + (void)conjx; + (void)cntx; + const float *restrict alpha = alpha_; + const float *restrict beta = beta_; + const float *restrict a = a_; + const float *restrict w = w_; + const float *restrict x = x_; + float *restrict y = y_; + float *restrict z = z_; + + if (b == 0) + return; + else if (m == 0 || *alpha == 0.f) { + // scale y by beta + if (*beta == 0.f) + bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incw *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + incz *= FLT_SIZE; + inc_t a_bump = 5 * lda; + while (b >= 5) { + // compute dot product of w with 5 rows of a + const float* restrict w_tmp = w; + const float* restrict z_tmp = z; + const float* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const float* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmul.vv v0, v24, v28"); + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + 
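+ // Register usage throughout this 5-column loop: v28 holds the current chunk
+ // of w, v24 the vector just loaded from a, v20 accumulates the linear
+ // combination sum_j x_j * a_j used to update z, and v0/v4/v8/v12/v16 collect
+ // the elementwise products a_j .* w whose reductions become five entries of y.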
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmul.vv v16, v24, v28"); + first = false; + } + else { + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmacc.vv v16, v24, v28"); + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmul.vv v0, v24, v28"); + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmul.vv v16, v24, v28"); + first = false; + } + else { + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmacc.vv v16, v24, v28"); + } + } // end a non-unit 
stride + + if (incz == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 5; + } + + if (b > 0) { + const float* restrict w_tmp = w; + const float* restrict z_tmp = z; + const float* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + __asm__("add %0, 
%0, %1" : "+r"(x) : "r"((b - 1) * incx)); + size_t avl = m; + bool first = true; + while (avl) { + const float* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + __asm__("vmv.v.i v20, 0"); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + case 3: + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + case 2: + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + case 1: + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmul.vv v0, v24, v28"); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + case 3: + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + case 2: + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + case 1: + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmul.vv v0, v24, v28"); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + 
__asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a non-unit stride + + if (incz == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 4: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 3: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 2: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 1: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + } + } // end cleanup + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef VLE +#undef VLSE +#undef VSE 
+#undef VSSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FMUL "fmul.d " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_ddotxaxpyf_sifive_x280_asm( + conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict w_, inc_t incw, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + void* restrict z_, inc_t incz, + const cntx_t* restrict cntx + ) { + (void)conjat; + (void)conja; + (void)conjw; + (void)conjx; + (void)cntx; + const double *restrict alpha = alpha_; + const double *restrict beta = beta_; + const double *restrict a = a_; + const double *restrict w = w_; + const double *restrict x = x_; + double *restrict y = y_; + double *restrict z = z_; + + if (b == 0) + return; + else if (m == 0 || *alpha == 0.) { + // scale y by beta + if (*beta == 0.) + bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incw *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + incz *= FLT_SIZE; + inc_t a_bump = 5 * lda; + while (b >= 5) { + // compute dot product of w with 5 rows of a + const double* restrict w_tmp = w; + const double* restrict z_tmp = z; + const double* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const double* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmul.vv v0, v24, v28"); + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmul.vv v16, v24, v28"); + first = false; + } + else { + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : 
"+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmacc.vv v16, v24, v28"); + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmul.vv v0, v24, v28"); + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmul.vv v16, v24, v28"); + first = false; + } + else { + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vf v20, v24, ft0"); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft4, v24"); + __asm__("vfmacc.vv v16, v24, v28"); + } + } // end a non-unit stride + + if (incz == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : 
"r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) 
{ + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 5; + } + + if (b > 0) { + const double* restrict w_tmp = w; + const double* restrict z_tmp = z; + const double* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); + size_t avl = m; + bool first = true; + while (avl) { + const double* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + __asm__("vmv.v.i v20, 0"); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + case 3: + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + case 2: + __asm__(FLT_LOAD "ft1, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + case 1: + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmul.vv v0, v24, v28"); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft3, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmul.vv v12, v24, v28"); + case 3: + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmul.vv v8, v24, v28"); + case 2: + __asm__(FLT_LOAD "ft1, (%0)" : : 
"r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : : "r"(x), "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmul.vv v4, v24, v28"); + case 1: + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmul.vv v0, v24, v28"); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft3, v24"); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft2, v24"); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vf v20, ft1, v24"); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vf v20, ft0, v24"); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a non-unit stride + + if (incz == FLT_SIZE) { + __asm__(VLE "v24, (%0)" : : "r"(z_tmp)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSE "v24, (%0)" : : "r"(z_tmp)); + } else { + __asm__(VLSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + __asm__("vfmacc.vf v24, ft10, v20"); + __asm__(VSSE "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 4: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 3: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 2: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) 
{ + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 1: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + } + } // end cleanup + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FMUL "fmul.s " +#define FMADD "fmadd.s " +#define FNMSUB "fnmsub.s " +#define FNEG "fneg.s " +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define VSE "vse32.v " + +void bli_cdotxaxpyf_sifive_x280_asm + ( + conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict w_, inc_t incw, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + void* restrict z_, inc_t incz, + const cntx_t* restrict cntx + ) +{ + (void)cntx; + const scomplex *restrict alpha = alpha_; + const scomplex *restrict beta = beta_; + const scomplex *restrict a = a_; + const scomplex *restrict w = w_; + const scomplex *restrict x = x_; + scomplex *restrict y = y_; + scomplex *restrict z = z_; + + if (b == 0) + return; + else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { + // scale y by beta + if (beta->real == 0.f && beta->imag == 0.f) + bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, + // and fa6-fa7 to store beta + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); + __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); + // Reduce to case when A^T is not conjugated, then conjugate + // computed product A^T * w if needed. + conj_t conjatw = BLIS_NO_CONJUGATE; + if (conjat == BLIS_CONJUGATE) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjw); + bli_toggle_conj(&conjatw); + } + conj_t conjax = BLIS_NO_CONJUGATE; + if (conja == BLIS_CONJUGATE) { + bli_toggle_conj(&conja); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjax); + } + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incw *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + incz *= 2 * FLT_SIZE; + // these are used to bump a and y, resp. 
+ inc_t a_bump = 5 * lda; + inc_t y_bump = incy - FLT_SIZE; + while (b >= 5) { + // compute dot product of w with 6 rows of a + const scomplex* restrict w_tmp = w; + const scomplex* restrict z_tmp = z; + const scomplex* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const scomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + if (inca == 2 * FLT_SIZE) { + if (conjw == BLIS_NO_CONJUGATE) { + // a unit stride, conjw = no conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv(v16, v18, 
v24, v26, v28, v30); + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a unit stride, conjw = conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjw == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjw = no conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" 
: "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a non-unit stride, conjw = conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + 
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_CONJUGATE + } // end a non-unit stride + + if (incz == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + 
vcmul_vf(v28, v29, v0, v2, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : 
"r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + // a += 5 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 5; + } + + if (b > 0) { + // cleanup loop, 0 < b < 5 + const scomplex* restrict w_tmp = w; + const scomplex* restrict z_tmp = z; + const scomplex* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); + size_t avl = m; + bool first = true; + while (avl) { + const scomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v22, 0"); + if (inca == 2 * FLT_SIZE) { + if (conjw == BLIS_NO_CONJUGATE) { + // a unit stride, conjw = no conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { 
+ case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a unit stride, conjw = conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjw == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjw = no conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + 
__asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a non-unit stride, conjw = conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { 
__asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_CONJUGATE + } // end a non-unit stride + + if (incz == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + y_bump = incy + FLT_SIZE; + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 4: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 3: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + 
__asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 2: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 1: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + } + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef FMADD +#undef FNMSUB +#undef FNEG +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef VSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FMUL "fmul.d " +#define FMADD "fmadd.d " +#define FNMSUB "fnmsub.d " +#define FNEG "fneg.d " +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define VSE "vse64.v " + +void bli_zdotxaxpyf_sifive_x280_asm + ( + conj_t conjat, + conj_t conja, + conj_t conjw, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict w_, inc_t incw, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* 
restrict y_, inc_t incy, + void* restrict z_, inc_t incz, + const cntx_t* restrict cntx + ) +{ + (void)cntx; + const dcomplex *restrict alpha = alpha_; + const dcomplex *restrict beta = beta_; + const dcomplex *restrict a = a_; + const dcomplex *restrict w = w_; + const dcomplex *restrict x = x_; + dcomplex *restrict y = y_; + dcomplex *restrict z = z_; + + if (b == 0) + return; + else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { + // scale y by beta + if (beta->real == 0. && beta->imag == 0.) + bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + // use ft0-ft9 to store 5 entries of x, ft10-ft11 to store alpha, + // and fa6-fa7 to store beta + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "fa6, (%0)" : : "r"(beta)); + __asm__(FLT_LOAD "fa7, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); + // Reduce to case when A^T is not conjugated, then conjugate + // computed product A^T * w if needed. + conj_t conjatw = BLIS_NO_CONJUGATE; + if (conjat == BLIS_CONJUGATE) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjw); + bli_toggle_conj(&conjatw); + } + conj_t conjax = BLIS_NO_CONJUGATE; + if (conja == BLIS_CONJUGATE) { + bli_toggle_conj(&conja); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjax); + } + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incw *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + incz *= 2 * FLT_SIZE; + // these are used to bump a and y, resp. + inc_t a_bump = 5 * lda; + inc_t y_bump = incy - FLT_SIZE; + while (b >= 5) { + // compute dot product of w with 6 rows of a + const dcomplex* restrict w_tmp = w; + const dcomplex* restrict z_tmp = z; + const dcomplex* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const dcomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + if (inca == 2 * FLT_SIZE) { + if (conjw == BLIS_NO_CONJUGATE) { + // a unit stride, conjw = no conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a unit stride, conjw = conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx 
== BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjw == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjw = no conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv(v0, v2, 
v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a non-unit stride, conjw = conj + if (first) { + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft9, ft9"); } + __asm__("add %0, %0, %1" : "+r"(x) : "r"(incx)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vf(v20, v22, v24, v26, ft0, ft1); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, 
ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft8, ft9, v24, v26); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + } + } // end conjw == BLIS_CONJUGATE + } // end a non-unit stride + + if (incz == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + // a += 5 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 5; + } + + if (b > 0) { + // cleanup loop, 0 < b < 5 + const dcomplex* restrict w_tmp = w; + const dcomplex* restrict z_tmp = z; + const dcomplex* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + __asm__("add %0, %0, %1" : "+r"(x) : "r"((b - 1) * incx)); + size_t avl = m; + bool first = true; + while (avl) { + const dcomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incw == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(w_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(w_tmp), "r"(incw)); + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v22, 0"); + if (inca == 2 * FLT_SIZE) { + if (conjw == BLIS_NO_CONJUGATE) { + // a unit stride, conjw = no conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, 
v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a unit stride, conjw = conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjw == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjw = no conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, 
%1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_NO_CONJUGATE + else { // conjw == BLIS_CONJUGATE + // a non-unit stride, conjw = conj + if (first) { + switch (b) { + case 4: + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft6, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft7, ft7"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft4, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft5, ft5"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(x)); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft3, ft3"); } + __asm__("sub %0, %0, %1" : "+r"(x) : "r"(incx)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(x), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(x)); + __asm__(VLSSEG2 
"v24, (%0), %1" : : "r"(a_row), "r"(inca)); + if (conjx == BLIS_CONJUGATE) { __asm__(FNEG "ft1, ft1"); } + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft6, ft7, v24, v26); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft4, ft5, v24, v26); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vf(v20, v22, ft2, ft3, v24, v26); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vf(v20, v22, ft0, ft1, v24, v26); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjw == BLIS_CONJUGATE + } // end a non-unit stride + + if (incz == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(z_tmp)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSEG2 "v24, (%0)" : : "r"(z_tmp)); + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + if (conjax == BLIS_NO_CONJUGATE) { + vcmacc_vf(v24, v26, ft10, ft11, v20, v22); + } + else { + vcmacc_vf_conj(v24, v26, ft10, ft11, v20, v22); + } + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(z_tmp), "r"(incz)); + } + + __asm__("add %0, %0, %1" : "+r"(w_tmp) : "r"(vl * incw)); + __asm__("add %0, %0, %1" : "+r"(z_tmp) : "r"(vl * incz)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + y_bump = incy + FLT_SIZE; + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 4: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 3: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 2: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 1: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatw == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft10, ft11); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft10, ft11); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, fa6, fa7, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatw == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft10, ft11, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft10, ft11, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + } + } + return; +} diff --git a/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c new file mode 100644 index 0000000000..5ac2d41667 --- /dev/null +++ b/kernels/sifive_x280/1f/bli_dotxf_sifive_x280_asm.c @@ -0,0 +1,2645 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include "../riscv_cmul_macros_asm.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FMUL "fmul.s " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " + +void bli_sdotxf_sifive_x280_asm( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + const cntx_t* restrict cntx + ) { + // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca) + // we process 6 elements of y per iteration, using y_tmp to load/store from + // y a points to the 6 x m block of a needed this iteration each 6 x m block + // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we + // use x_tmp to load from x a_row is used to load each of the 6 rows of this + // 6 x vl block + (void)conjat; + (void)conjx; + (void)cntx; + const float* restrict alpha = alpha_; + const float* restrict a = a_; + const float* restrict x = x_; + const float* restrict beta = beta_; + float* restrict y = y_; + + if (b == 0) + return; + else if (m == 0 || *alpha == 0.f) { + // scale y by beta + if (*beta == 0.f) + bli_ssetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_sscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + inc_t a_bump = 6 * lda; // to bump a down 6 rows + + while (b >= 6) { + // compute dot product of x with 6 rows of a + const float* restrict x_tmp = x; + const float* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const float* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + __asm__(VLE "v0, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v0, v0, v28"); + __asm__(VLE "v4, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + __asm__(VLE "v8, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : 
"+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + __asm__(VLE "v12, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + __asm__(VLE "v16, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + __asm__(VLE "v20, (%0)" : : "r"(a_row)); + __asm__("vfmul.vv v20, v20, v28"); + first = false; + } + else { + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vv v20, v24, v28"); + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v0, v0, v28"); + __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmul.vv v20, v20, v28"); + first = false; + } + else { + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vv v20, v24, v28"); + } + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, 
ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v20, v20, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v20, v20, ft10"); + __asm__(VSE "v20, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v20"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + // a += 6 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 6; + } + + if (b > 0) { + // compute dot product of x with remaining < 6 rows of a + const float* restrict x_tmp = x; + // a_col will move along the last row of a! 
+ const float* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + size_t avl = m; + bool first = true; + while (avl) { + const float* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + switch (b) { + case 5: + __asm__(VLE "v16, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + case 4: + __asm__(VLE "v12, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + case 3: + __asm__(VLE "v8, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + case 2: + __asm__(VLE "v4, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_row)); + __asm__("vfmul.vv v0, v0, v28"); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + case 4: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + switch (b) { + case 5: + __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + case 4: + __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + case 3: + __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + case 2: + __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + case 1: + __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmul.vv v0, v0, v28"); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + case 4: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a non-unit stride 
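The two switch statements above (unit and non-unit stride) rely on deliberate fall-through: entering at case b with no break statements processes exactly the b leftover rows, walking a_row backwards by lda from the last row. A scalar sketch of the same control flow, illustrative only (hypothetical sdotxf_cleanup_ref name, element strides instead of the kernel's byte strides):

// acc[i] += sum_j a(i,j) * x[j] for the leftover 0 < b < 6 rows, using the
// same descending fall-through switch as the vector code above.
static void sdotxf_cleanup_ref(dim_t m, dim_t b,
                               const float* a, inc_t inca, inc_t lda,
                               const float* x, inc_t incx,
                               float* acc /* length b, zeroed by the caller */)
{
    for (dim_t j = 0; j < m; ++j) {
        const float xj = x[j * incx];
        const float* p = a + (b - 1) * lda + j * inca; // element j of the last row
        switch (b) {
            case 5: acc[4] += *p * xj; p -= lda; // fall through
            case 4: acc[3] += *p * xj; p -= lda; // fall through
            case 3: acc[2] += *p * xj; p -= lda; // fall through
            case 2: acc[1] += *p * xj; p -= lda; // fall through
            case 1: acc[0] += *p * xj;
        }
    }
}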
+ __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + __asm__("vmv.s.x v31, x0"); + switch (b) { + case 5: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 4: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 3: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 2: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 1: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.f) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + } + } // end cleanup + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FMUL "fmul.d " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " + +void bli_ddotxf_sifive_x280_asm( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + const cntx_t* restrict 
cntx + ) { + // think of a as b x m row major matrix (i.e. rsa = lda, csa = inca) + // we process 6 elements of y per iteration, using y_tmp to load/store from + // y a points to the 6 x m block of a needed this iteration each 6 x m block + // is broken into 6 x vl blocks a_col points to the current 6 x vl block, we + // use x_tmp to load from x a_row is used to load each of the 6 rows of this + // 6 x vl block + (void)conjat; + (void)conjx; + (void)cntx; + const double* restrict alpha = alpha_; + const double* restrict a = a_; + const double* restrict x = x_; + const double* restrict beta = beta_; + double* restrict y = y_; + + if (b == 0) + return; + else if (m == 0 || *alpha == 0.) { + // scale y by beta + if (*beta == 0.) + bli_dsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_dscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + inca *= FLT_SIZE; + lda *= FLT_SIZE; + incx *= FLT_SIZE; + incy *= FLT_SIZE; + inc_t a_bump = 6 * lda; // to bump a down 6 rows + + while (b >= 6) { + // compute dot product of x with 6 rows of a + const double* restrict x_tmp = x; + const double* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const double* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + __asm__(VLE "v0, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v0, v0, v28"); + __asm__(VLE "v4, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + __asm__(VLE "v8, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + __asm__(VLE "v12, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + __asm__(VLE "v16, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + __asm__(VLE "v20, (%0)" : : "r"(a_row)); + __asm__("vfmul.vv v20, v20, v28"); + first = false; + } + else { + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vv v20, v24, v28"); + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v0, v0, v28"); + __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, 
v4, v28"); + __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + __asm__(VLSE "v20, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmul.vv v20, v20, v28"); + first = false; + } + else { + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v0, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vv v20, v24, v28"); + } + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) 
{ + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v20, v20, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v20, v20, ft10"); + __asm__(VSE "v20, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v20"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("add %0, %0, %1" : "+r"(y) : "r"(incy)); + + // a += 6 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 6; + } + + if (b > 0) { + // compute dot product of x with remaining < 6 rows of a + const double* restrict x_tmp = x; + // a_col will move along the last row of a! + const double* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + size_t avl = m; + bool first = true; + while (avl) { + const double* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == FLT_SIZE) + __asm__(VLE "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSE "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == FLT_SIZE) { + // a unit stride + if (first) { + switch (b) { + case 5: + __asm__(VLE "v16, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + case 4: + __asm__(VLE "v12, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + case 3: + __asm__(VLE "v8, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + case 2: + __asm__(VLE "v4, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_row)); + __asm__("vfmul.vv v0, v0, v28"); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + case 4: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLE "v24, (%0)" : : "r"(a_row)); + __asm__("vfmacc.vv v0, v24, 
v28"); + } + } + } // end a unit stride + else { + // a non-unit stride + if (first) { + switch (b) { + case 5: + __asm__(VLSE "v16, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v16, v16, v28"); + case 4: + __asm__(VLSE "v12, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v12, v12, v28"); + case 3: + __asm__(VLSE "v8, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v8, v8, v28"); + case 2: + __asm__(VLSE "v4, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmul.vv v4, v4, v28"); + case 1: + __asm__(VLSE "v0, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmul.vv v0, v0, v28"); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v16, v24, v28"); + case 4: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v12, v24, v28"); + case 3: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v8, v24, v28"); + case 2: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + __asm__("vfmacc.vv v4, v24, v28"); + case 1: + __asm__(VLSE "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("vfmacc.vv v0, v24, v28"); + } + } + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + __asm__("vmv.s.x v31, x0"); + switch (b) { + case 5: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v16, v16, ft10"); + __asm__(VSE "v16, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v16"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 4: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v12, v12, ft10"); + __asm__(VSE "v12, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v12"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 3: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) 
{ + __asm__("vfmul.vf v8, v8, ft10"); + __asm__(VSE "v8, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v8"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 2: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v4, v4, ft10"); + __asm__(VSE "v4, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v4"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(incy)); + case 1: + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (*beta == 0.) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__(VSE "v0, (%0)" : : "r"(y)); + } + else { + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(y)); + __asm__(FMUL "ft0, ft11, ft0"); + __asm__("vfmv.s.f v30, ft0"); + __asm__("vfmacc.vf v30, ft10, v0"); + __asm__(VSE "v30, (%0)" : : "r"(y)); + } + } + } // end cleanup + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define FMUL "fmul.s " +#define FMADD "fmadd.s " +#define FNMSUB "fnmsub.s " +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define VSE "vse32.v " + +void bli_cdotxf_sifive_x280_asm( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + const cntx_t* restrict cntx + ) { + (void)cntx; + const scomplex* restrict alpha = alpha_; + const scomplex* restrict a = a_; + const scomplex* restrict x = x_; + const scomplex* restrict beta = beta_; + scomplex* restrict y = y_; + + if (b == 0) + return; + else if (m == 0 || (alpha->real == 0.f && alpha->imag == 0.f)) { + // scale y by beta + if (beta->real == 0.f && beta->imag == 0.f) + bli_csetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_cscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); + // Reduce to case when A^T is not conjugated, then conjugate + // computed product A^T * x if needed. + conj_t conjatx = BLIS_NO_CONJUGATE; + if (conjat == BLIS_CONJUGATE) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjatx); + } + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + // these are used to bump a and y, resp. 
+ inc_t a_bump = 6 * lda; + inc_t y_bump = incy - FLT_SIZE; + while (b >= 6) { + // compute dot product of x with 6 rows of a + const scomplex* restrict x_tmp = x; + const scomplex* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const scomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == 2 * FLT_SIZE) { + if (conjx == BLIS_NO_CONJUGATE) { + // a unit stride, conjx = no conj + if (first) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a unit stride, conjx = conj + if (first) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv_conj(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, 
v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjx == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjx = no conj + if (first) { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx = BLIS_CONJUGATE + // a non-unit stride, conjx = conj + if (first) { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + 
vcmul_vv_conj(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_CONJUGATE + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft8, ft9); + } + else { + 
vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v20, v20, v31"); + __asm__("vfredusum.vs v22, v22, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v20, v22, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v20, v22); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); 
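+        // Descriptive note (added comment): the dot products for this block of 6
+        // columns of a are now complete. Above, each accumulator pair (real, imag)
+        // was reduced with vfredusum.vs, scaled by alpha, combined with beta*y when
+        // beta is nonzero, and stored to y. Below, a is advanced by 6 columns
+        // (a_bump = 6 * lda) and b is decremented by 6; any remaining 1-5 columns
+        // are handled by the cleanup loop further down via switch fall-through.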
+ + // a += 6 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 6; + } + + if (b > 0) { + // cleanup loop, 0 < b < 6 + const scomplex* restrict x_tmp = x; + const scomplex* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + size_t avl = m; + bool first = true; + while (avl) { + const scomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == 2 * FLT_SIZE) { + if (conjx == BLIS_NO_CONJUGATE) { + // a unit stride, conjx = no conj + if (first) { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a unit stride, conjx = conj + if (first) { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : 
"r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjx == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjx = no conj + if (first) { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a non-unit stride, conjx = conj + if (first) { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), 
"r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_CONJUGATE + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + y_bump = incy + FLT_SIZE; + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 5: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 4: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 3: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); + } + } + __asm__(VSE "v28, 
(%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 2: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 1: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0.f && beta->imag == 0.f) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + } + } // end cleanup + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef FMUL +#undef FMADD +#undef FNMSUB +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef VSE + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define FMUL "fmul.d " +#define FMADD "fmadd.d " +#define FNMSUB "fnmsub.d " +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define VSE "vse64.v " + +void bli_zdotxf_sifive_x280_asm( + conj_t conjat, + conj_t conjx, + dim_t m, + dim_t b, + const void* restrict alpha_, + const void* restrict a_, inc_t inca, inc_t lda, + const void* restrict x_, inc_t incx, + const void* restrict beta_, + void* restrict y_, inc_t incy, + const cntx_t* restrict cntx + ) { + (void)cntx; + const dcomplex* restrict alpha = alpha_; + const dcomplex* restrict a = a_; + const dcomplex* restrict x = x_; + const dcomplex* restrict beta = beta_; + dcomplex* restrict y = y_; + + if (b == 0) + return; + else if (m == 0 || (alpha->real == 0. && alpha->imag == 0.)) { + // scale y by beta + if (beta->real == 0. && beta->imag == 0.) 
+ bli_zsetv_sifive_x280_asm(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + else + bli_zscalv_sifive_x280_intr(BLIS_NO_CONJUGATE, b, beta, y, incy, NULL); + return; + } + + __asm__(FLT_LOAD "ft8, (%0)" : : "r"(alpha)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE)); + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(beta)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(beta), "I"(FLT_SIZE)); + // Reduce to case when A^T is not conjugated, then conjugate + // computed product A^T * x if needed. + conj_t conjatx = BLIS_NO_CONJUGATE; + if (conjat == BLIS_CONJUGATE) { + bli_toggle_conj(&conjat); + bli_toggle_conj(&conjx); + bli_toggle_conj(&conjatx); + } + inca *= 2 * FLT_SIZE; + lda *= 2 * FLT_SIZE; + incx *= 2 * FLT_SIZE; + incy *= 2 * FLT_SIZE; + // these are used to bump a and y, resp. + inc_t a_bump = 6 * lda; + inc_t y_bump = incy - FLT_SIZE; + while (b >= 6) { + // compute dot product of x with 6 rows of a + const dcomplex* restrict x_tmp = x; + const dcomplex* restrict a_col = a; + size_t avl = m; + bool first = true; + while (avl) { + const dcomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == 2 * FLT_SIZE) { + if (conjx == BLIS_NO_CONJUGATE) { + // a unit stride, conjx = no conj + if (first) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a unit stride, conjx = conj + if (first) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + 
__asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv_conj(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjx == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjx = no conj + if (first) { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx = BLIS_CONJUGATE + // a non-unit stride, conjx = conj + if (first) { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, 
%0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv_conj(v20, v22, v24, v26, v28, v30); + first = false; + } + else { + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("add %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv_conj(v20, v22, v24, v26, v28, v30); + } + } // end conjx == BLIS_CONJUGATE + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("vmv.s.x v31, x0"); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v20, v20, v31"); + __asm__("vfredusum.vs v22, v22, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v20, v22, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v20, v22, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v20, v22); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v20, v22); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("add %0, %0, %1" : "+r"(y) : "r"(y_bump)); + + // a += 6 * lda; + __asm__("add %0, %0, %1" : "+r"(a) : "r"(a_bump)); + b -= 6; + } + + if (b > 0) { + // cleanup loop, 0 < b < 6 + const dcomplex* restrict x_tmp = x; + const dcomplex* restrict a_col; + __asm__("add %0, %1, %2" : "=r"(a_col) : "r"(a), "r"((b - 1) * lda)); + size_t avl = m; + bool first = true; + while (avl) { + const dcomplex* restrict a_row = a_col; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m2, tu, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + if (incx == 2 * FLT_SIZE) + __asm__(VLSEG2 "v28, (%0)" : : "r"(x_tmp)); + else + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(x_tmp), "r"(incx)); + if (inca == 2 * FLT_SIZE) { + if (conjx == BLIS_NO_CONJUGATE) { + // a unit stride, conjx = no conj + if (first) { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : 
"r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a unit stride, conjx = conj + if (first) { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(a_row)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_CONJUGATE + } // end a unit stride + else { // a non-unit stride + if (conjx == BLIS_NO_CONJUGATE) { + // a non-unit stride, conjx = no conj + if (first) { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v8, v10, v24, v26, v28, v30); + case 
2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_NO_CONJUGATE + else { // conjx == BLIS_CONJUGATE + // a non-unit stride, conjx = conj + if (first) { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmul_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmul_vv_conj(v0, v2, v24, v26, v28, v30); + } + first = false; + } + else { + switch (b) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v16, v18, v24, v26, v28, v30); + case 4: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v12, v14, v24, v26, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v8, v10, v24, v26, v28, v30); + case 2: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + __asm__("sub %0, %0, %1" : "+r"(a_row) : "r"(lda)); + vcmacc_vv_conj(v4, v6, v24, v26, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(a_row), "r"(inca)); + vcmacc_vv_conj(v0, v2, v24, v26, v28, v30); + } + } + } // end conjx == BLIS_CONJUGATE + } // end a non-unit stride + __asm__("add %0, %0, %1" : "+r"(x_tmp) : "r"(vl * incx)); + __asm__("add %0, %0, %1" : "+r"(a_col) : "r"(vl * inca)); + avl -= vl; + } + + __asm__("add %0, %0, %1" : "+r"(y) : "r"((b - 1) * incy)); + y_bump = incy + FLT_SIZE; + __asm__("vmv.s.x v31, x0"); + + switch (b) { + case 5: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v16, v16, v31"); + __asm__("vfredusum.vs v18, v18, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v16, v18, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v16, v18, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v16, v18); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v16, v18); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 4: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v12, v12, v31"); + __asm__("vfredusum.vs v14, v14, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v12, v14, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v12, v14, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v12, v14); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v12, v14); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 3: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v8, v8, v31"); + __asm__("vfredusum.vs v10, v10, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v8, v10, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v8, v10, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v8, v10); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v8, v10); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 2: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v4, v4, v31"); + __asm__("vfredusum.vs v6, v6, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) 
{ + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v4, v6, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v4, v6, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v4, v6); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v4, v6); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + __asm__("sub %0, %0, %1" : "+r"(y) : "r"(y_bump)); + case 1: + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(m), "i"(8 * FLT_SIZE)); + __asm__("vfredusum.vs v0, v0, v31"); + __asm__("vfredusum.vs v2, v2, v31"); + __asm__("vsetivli zero, 1, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + if (beta->real == 0. && beta->imag == 0.) { + if (conjatx == BLIS_NO_CONJUGATE) { + vcmul_vf(v28, v29, v0, v2, ft8, ft9); + } + else { + vcmul_vf_conj(v28, v29, v0, v2, ft8, ft9); + } + } + else { + __asm__(FLT_LOAD "ft2, (%0)" : : "r"(y)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(y), "I"(FLT_SIZE)); + cmul(ft0, ft1, ft10, ft11, ft2, ft3); + __asm__("vfmv.s.f v28, ft0"); + __asm__("vfmv.s.f v29, ft1"); + if (conjatx == BLIS_NO_CONJUGATE) { + vcmacc_vf(v28, v29, ft8, ft9, v0, v2); + } + else { + vcmacc_vf_conj(v28, v29, ft8, ft9, v0, v2); + } + } + __asm__(VSE "v28, (%0)" : : "r"(y)); + __asm__("addi %0, %0, %1" : "+r"(y) : "I"(FLT_SIZE)); + __asm__(VSE "v29, (%0)" : : "r"(y)); + } + } // end cleanup + return; +} diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c new file mode 100644 index 0000000000..35ca23677d --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_mrxk.c @@ -0,0 +1,678 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include "../riscv_cmul_macros_asm.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSSEG7 "vssseg7e32.v " + +void bli_spackm_sifive_x280_asm_7xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) conja; + (void) cntx; + const float* kappa = kappa_; + const float* a = a_; + float* p = p_; + + float kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 0: __asm__("vmv.v.i v0, 0"); + case 1: __asm__("vmv.v.i v1, 0"); + case 2: __asm__("vmv.v.i v2, 0"); + case 3: __asm__("vmv.v.i v3, 0"); + case 4: __asm__("vmv.v.i v4, 0"); + case 5: __asm__("vmv.v.i v5, 0"); + case 6: __asm__("vmv.v.i v6, 0"); + } + a += (cdim - 1) * inca; + size_t avl = n; + while (avl) { + const float* a_tmp = a; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 7: + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 6: + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast != 1.f) { + switch (cdim) { + case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + } + __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= FLT_SIZE; + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast != 1.f) { + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSE "v0, (%0)" : : "r"(p)); + a += lda; + p += ldp; + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSSEG7 + +#define FLT_SIZE 8 +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSSEG7 "vssseg7e64.v " + +void 
bli_dpackm_sifive_x280_asm_7xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) conja; + (void) cntx; + const double* kappa = kappa_; + const double* a = a_; + double* p = p_; + + double kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 0: __asm__("vmv.v.i v0, 0"); + case 1: __asm__("vmv.v.i v1, 0"); + case 2: __asm__("vmv.v.i v2, 0"); + case 3: __asm__("vmv.v.i v3, 0"); + case 4: __asm__("vmv.v.i v4, 0"); + case 5: __asm__("vmv.v.i v5, 0"); + case 6: __asm__("vmv.v.i v6, 0"); + } + a += (cdim - 1) * inca; + size_t avl = n; + while (avl) { + const double* a_tmp = a; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 7: + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 6: + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast != 1.) { + switch (cdim) { + case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + } + __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p), "r"(FLT_SIZE * ldp)); + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= FLT_SIZE; + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast != 1.) 
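+            /* scale the gathered column by kappa only when it differs from
+               one, so the common kappa == 1 case skips the extra multiply */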
{ + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSE "v0, (%0)" : : "r"(p)); + a += lda; + p += ldp; + } + __asm__ volatile("vsetivli zero, 7, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef VLE +#undef VLSE +#undef VSE +#undef VSSSEG7 + +#define FLT_SIZE 4 +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG6 "vssseg6e32.v " + +void bli_cpackm_sifive_x280_asm_6xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) cntx; + const scomplex* kappa = kappa_; + const scomplex* a = a_; + scomplex* p = p_; + + scomplex kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + switch (cdim) { + case 0: + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + case 1: + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v3, 0"); + case 2: + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v5, 0"); + case 3: + __asm__("vmv.v.i v6, 0"); + __asm__("vmv.v.i v7, 0"); + case 4: + __asm__("vmv.v.i v8, 0"); + __asm__("vmv.v.i v9, 0"); + case 5: + __asm__("vmv.v.i v10, 0"); + __asm__("vmv.v.i v11, 0"); + } + } + else { + switch (cdim) { + case 0: + __asm__("vmv.v.i v12, 0"); + __asm__("vmv.v.i v13, 0"); + case 1: + __asm__("vmv.v.i v14, 0"); + __asm__("vmv.v.i v15, 0"); + case 2: + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v17, 0"); + case 3: + __asm__("vmv.v.i v18, 0"); + __asm__("vmv.v.i v19, 0"); + case 4: + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v21, 0"); + case 5: + __asm__("vmv.v.i v22, 0"); + __asm__("vmv.v.i v23, 0"); + } + } + a += (cdim - 1) * inca; + size_t avl = n; + while (avl) { + const scomplex* a_tmp = a; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 6: + __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + if (conja == BLIS_CONJUGATE) { + switch (cdim) { + case 6: __asm__("vfneg.v v11, v11"); + case 5: __asm__("vfneg.v v9, v9"); + case 4: __asm__("vfneg.v v7, v7"); + case 3: __asm__("vfneg.v v5, v5"); + case 2: __asm__("vfneg.v v3, v3"); + case 1: __asm__("vfneg.v v1, v1"); + } + } + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); + __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + switch (cdim) { + case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); + case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); + case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); + case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: 
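+                        /* fall-through scaling: each case forms kappa times
+                           one row of a, held as separate real/imaginary
+                           vectors, with vcmul_vf2 placing the real part in
+                           the first (even-numbered) destination register and
+                           the imaginary part in the second */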
vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + else { + switch (cdim) { + case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); + case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); + case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); + case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); + __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= 2 * FLT_SIZE; + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v3, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v1, v1"); + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); + } + a += lda; + p += ldp; + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG6 + +#define FLT_SIZE 8 +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG6 "vssseg6e64.v " + +void bli_zpackm_sifive_x280_asm_6xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) cntx; + const dcomplex* kappa = kappa_; + const dcomplex* a = a_; + dcomplex* p = p_; + + dcomplex kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
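+        /* zero-fill the registers holding rows cdim..5 of the micro-panel;
+           which group needs clearing depends on where the stores come from:
+           v0-v11 when kappa == 1 (the loaded data is stored in place, at most
+           conjugated), v12-v23 otherwise (vcmul_vf2 / vcmul_vf_conj2 write
+           the scaled copies there) */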
{ + switch (cdim) { + case 0: + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + case 1: + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v3, 0"); + case 2: + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v5, 0"); + case 3: + __asm__("vmv.v.i v6, 0"); + __asm__("vmv.v.i v7, 0"); + case 4: + __asm__("vmv.v.i v8, 0"); + __asm__("vmv.v.i v9, 0"); + case 5: + __asm__("vmv.v.i v10, 0"); + __asm__("vmv.v.i v11, 0"); + } + } + else { + switch (cdim) { + case 0: + __asm__("vmv.v.i v12, 0"); + __asm__("vmv.v.i v13, 0"); + case 1: + __asm__("vmv.v.i v14, 0"); + __asm__("vmv.v.i v15, 0"); + case 2: + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v17, 0"); + case 3: + __asm__("vmv.v.i v18, 0"); + __asm__("vmv.v.i v19, 0"); + case 4: + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v21, 0"); + case 5: + __asm__("vmv.v.i v22, 0"); + __asm__("vmv.v.i v23, 0"); + } + } + a += (cdim - 1) * inca; + size_t avl = n; + while (avl) { + const dcomplex* a_tmp = a; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + switch (cdim) { + case 6: + __asm__(VLSEG2 "v10, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLSEG2 "v8, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { + if (conja == BLIS_CONJUGATE) { + switch (cdim) { + case 6: __asm__("vfneg.v v11, v11"); + case 5: __asm__("vfneg.v v9, v9"); + case 4: __asm__("vfneg.v v7, v7"); + case 3: __asm__("vfneg.v v5, v5"); + case 2: __asm__("vfneg.v v3, v3"); + case 1: __asm__("vfneg.v v1, v1"); + } + } + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); + __asm__(VSSSEG6 "v6, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + switch (cdim) { + case 6: vcmul_vf2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); + case 5: vcmul_vf2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); + case 4: vcmul_vf2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); + case 3: vcmul_vf2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + else { + switch (cdim) { + case 6: vcmul_vf_conj2(v22, v23, v10, v11, kappa_cast.real, kappa_cast.imag); + case 5: vcmul_vf_conj2(v20, v21, v8, v9, kappa_cast.real, kappa_cast.imag); + case 4: vcmul_vf_conj2(v18, v19, v6, v7, kappa_cast.real, kappa_cast.imag); + case 3: vcmul_vf_conj2(v16, v17, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf_conj2(v14, v15, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf_conj2(v12, v13, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + __asm__(VSSSEG6 "v12, (%0), %1" : : "r"(p), "r"(2 * FLT_SIZE * ldp)); + __asm__(VSSSEG6 "v18, (%0), %1" : : "r"(p + 3), "r"(2 * FLT_SIZE * ldp)); + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= 2 * FLT_SIZE; + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); 
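+        /* general-stride path: v0-v3 are cleared once, and each iteration
+           loads cdim complex elements with a (strided) segment load under a
+           tail-undisturbed policy, so rows cdim..5 of the stored panel
+           remain zero */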
+ __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v3, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m1, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v1, v1"); + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v2, v3, v0, v1, kappa_cast.real, kappa_cast.imag); + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v2, (%0)" : : "r"(p)); + } + a += lda; + p += ldp; + } + __asm__ volatile("vsetivli zero, 6, e%0, m1, ta, ma" : : "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v1, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} diff --git a/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c new file mode 100644 index 0000000000..89e05ecae3 --- /dev/null +++ b/kernels/sifive_x280/1m/bli_packm_sifive_x280_asm_nrxk.c @@ -0,0 +1,838 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" +#include "../riscv_cmul_macros_asm.h" +#include +#include +#include +#include + +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " +#define VSSSEG8 "vssseg8e32.v " +#define VSSSEG7 "vssseg7e32.v " +#define VSSSEG6 "vssseg6e32.v " +#define VSSSEG5 "vssseg5e32.v " +#define VSSSEG4 "vssseg4e32.v " +#define VSSSEG3 "vssseg3e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define NR 64 + +void bli_spackm_sifive_x280_asm_64xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) conja; + (void) cntx; + const float* kappa = kappa_; + const float* a = a_; + float* p = p_; + + float kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v8, 0"); + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + dim_t cdim_tmp = cdim; + const float* a_tmp = a; + float* p_tmp = p; + while (cdim_tmp >= 8) { + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v7, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + if (kappa_cast != 1.f) { + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); + } + __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + p_tmp += 8; + cdim_tmp -= 8; + } + if (cdim_tmp > 0) { + a_tmp += (cdim_tmp - 1) * inca; + switch (cdim_tmp) { + case 7: + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 6: + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast != 1.f) { + switch (cdim_tmp) { + case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + } + switch (cdim_tmp) { + case 7: + __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 6: + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 
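+                /* remaining cases follow the same pattern: the leftover
+                   cdim_tmp rows are written with the vssseg variant whose
+                   field count matches cdim_tmp, down to a single strided
+                   vsse store when only one row remains */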
5: + __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 4: + __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 3: + __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + } + p_tmp += cdim_tmp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); + for (size_t i = 0; i < vl; ++i) { + __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); + p_tmp += ldp; + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v8, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= FLT_SIZE; + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast != 1.f) { + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSE "v0, (%0)" : : "r"(p)); + a += lda; + p += ldp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE +#undef VSSSEG8 +#undef VSSSEG7 +#undef VSSSEG6 +#undef VSSSEG5 +#undef VSSSEG4 +#undef VSSSEG3 +#undef VSSSEG2 +#undef NR + +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " +#define VSSSEG8 "vssseg8e64.v " +#define VSSSEG7 "vssseg7e64.v " +#define VSSSEG6 "vssseg6e64.v " +#define VSSSEG5 "vssseg5e64.v " +#define VSSSEG4 "vssseg4e64.v " +#define VSSSEG3 "vssseg3e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define NR 32 + +void bli_dpackm_sifive_x280_asm_32xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) conja; + (void) cntx; + const double* kappa = kappa_; + const double* a = a_; + double* p = p_; + + double kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v8, 0"); + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + dim_t cdim_tmp = cdim; + const double* a_tmp = a; + double* p_tmp = p; + while (cdim_tmp >= 8) { + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + 
__asm__(VLE "v7, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + if (kappa_cast != 1.) { + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + __asm__("vfmul.vf v7, v7, %0" : : "f"(kappa_cast)); + } + __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + p_tmp += 8; + cdim_tmp -= 8; + } + if (cdim_tmp > 0) { + a_tmp += (cdim_tmp - 1) * inca; + switch (cdim_tmp) { + case 7: + __asm__(VLE "v6, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 6: + __asm__(VLE "v5, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 5: + __asm__(VLE "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 4: + __asm__(VLE "v3, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 3: + __asm__(VLE "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLE "v1, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLE "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast != 1.) { + switch (cdim_tmp) { + case 7: __asm__("vfmul.vf v6, v6, %0" : : "f"(kappa_cast)); + case 6: __asm__("vfmul.vf v5, v5, %0" : : "f"(kappa_cast)); + case 5: __asm__("vfmul.vf v4, v4, %0" : : "f"(kappa_cast)); + case 4: __asm__("vfmul.vf v3, v3, %0" : : "f"(kappa_cast)); + case 3: __asm__("vfmul.vf v2, v2, %0" : : "f"(kappa_cast)); + case 2: __asm__("vfmul.vf v1, v1, %0" : : "f"(kappa_cast)); + case 1: __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + } + switch (cdim_tmp) { + case 7: + __asm__(VSSSEG7 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 6: + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 5: + __asm__(VSSSEG5 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 4: + __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 3: + __asm__(VSSSEG3 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(p_tmp), "r"(FLT_SIZE * ldp)); + break; + } + p_tmp += cdim_tmp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); + for (size_t i = 0; i < vl; ++i) { + __asm__(VSE "v8, (%0)" : : "r"(p_tmp)); + p_tmp += ldp; + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v8, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= FLT_SIZE; + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m4, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == FLT_SIZE) { + __asm__(VLE "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSE "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast != 1.) 
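+        /* rows cdim..NR-1 of each packed column stay zero: v0 was cleared
+           above and the loads run with vl = cdim under a tail-undisturbed
+           (tu) policy, so only the kappa scaling remains to be done here */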
{ + __asm__("vfmul.vf v0, v0, %0" : : "f"(kappa_cast)); + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSE "v0, (%0)" : : "r"(p)); + a += lda; + p += ldp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSE "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE +#undef VSSSEG8 +#undef VSSSEG7 +#undef VSSSEG6 +#undef VSSSEG5 +#undef VSSSEG4 +#undef VSSSEG3 +#undef VSSSEG2 +#undef NR + +#define FLT_SIZE 4 +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define VSSSEG4 "vssseg4e32.v " +#define VSSSEG6 "vssseg6e32.v " +#define VSSSEG8 "vssseg8e32.v " +#define NR 32 + +void bli_cpackm_sifive_x280_asm_32xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) cntx; + const scomplex* kappa = kappa_; + const scomplex* a = a_; + scomplex* p = p_; + + scomplex kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v18, 0"); + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + dim_t cdim_tmp = cdim; + const scomplex* a_tmp = a; + scomplex* p_tmp = p; + while (cdim_tmp >= 4) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v1, v1"); + __asm__("vfneg.v v3, v3"); + __asm__("vfneg.v v5, v5"); + __asm__("vfneg.v v7, v7"); + } + __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); + } + __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + } + p_tmp += 4; + cdim_tmp -= 4; + } + if (cdim_tmp > 0) { + a_tmp += (cdim_tmp - 1) * inca; + switch (cdim_tmp) { + case 3: + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + if (conja == BLIS_CONJUGATE) { + switch (cdim_tmp) { + case 3: __asm__("vfneg.v v5, v5"); + case 2: __asm__("vfneg.v v3, v3"); + case 1: __asm__("vfneg.v v1, v1"); + } + } + switch (cdim_tmp) { + case 3: + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), 
"r"(2 * FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + } + } + else { + if (conja == BLIS_NO_CONJUGATE) { + switch (cdim_tmp) { + case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + else { + switch (cdim_tmp) { + case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + switch (cdim_tmp) { + case 3: + __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + } + } + p_tmp += cdim_tmp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); + for (size_t i = 0; i < vl; ++i) { + __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); + p_tmp += ldp; + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= 2 * FLT_SIZE; + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v6, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast.real == 1.f && kappa_cast.imag == 0.f) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v2, v2"); + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); + } + a += lda; + p += ldp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} + +#undef FLT_SIZE +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef VSSSEG4 +#undef VSSSEG6 +#undef VSSSEG8 +#undef NR + +#define FLT_SIZE 8 +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define VSSSEG4 "vssseg4e64.v " +#define VSSSEG6 "vssseg6e64.v " +#define VSSSEG8 "vssseg8e64.v " +#define NR 16 + +void bli_zpackm_sifive_x280_asm_16xk + ( + conj_t conja, + pack_t schema, + dim_t cdim, + dim_t n, + dim_t n_max, + const void* restrict kappa_, + const void* restrict a_, inc_t inca, inc_t lda, + void* restrict 
p_, inc_t ldp, + const cntx_t* cntx + ) +{ + (void) cntx; + const dcomplex* kappa = kappa_; + const dcomplex* a = a_; + dcomplex* p = p_; + + dcomplex kappa_cast = *kappa; + if (lda == 1) { + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v18, 0"); + size_t avl = n; + while (avl) { + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m1, ta, ma" : "=r"(vl) : "r"(avl), "i"(8 * FLT_SIZE)); + dim_t cdim_tmp = cdim; + const dcomplex* a_tmp = a; + dcomplex* p_tmp = p; + while (cdim_tmp >= 4) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + __asm__(VLSEG2 "v6, (%0)" : : "r"(a_tmp)); + a_tmp += inca; + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v1, v1"); + __asm__("vfneg.v v3, v3"); + __asm__("vfneg.v v5, v5"); + __asm__("vfneg.v v7, v7"); + } + __asm__(VSSSEG8 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + vcmul_vf2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + vcmul_vf_conj2(v14, v15, v6, v7, kappa_cast.real, kappa_cast.imag); + } + __asm__(VSSSEG8 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + } + p_tmp += 4; + cdim_tmp -= 4; + } + if (cdim_tmp > 0) { + a_tmp += (cdim_tmp - 1) * inca; + switch (cdim_tmp) { + case 3: + __asm__(VLSEG2 "v4, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 2: + __asm__(VLSEG2 "v2, (%0)" : : "r"(a_tmp)); + a_tmp -= inca; + case 1: + __asm__(VLSEG2 "v0, (%0)" : : "r"(a_tmp)); + } + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) 
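+            /* edge handling for the last 1-3 rows (cdim_tmp): with kappa == 1
+               the loaded real/imag pairs are stored directly, negating the
+               imaginary parts first if conjugation was requested; otherwise
+               vcmul_vf2 / vcmul_vf_conj2 forms kappa * a in v8-v13 and those
+               registers are stored with the matching vssseg width */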
{ + if (conja == BLIS_CONJUGATE) { + switch (cdim_tmp) { + case 3: __asm__("vfneg.v v5, v5"); + case 2: __asm__("vfneg.v v3, v3"); + case 1: __asm__("vfneg.v v1, v1"); + } + } + switch (cdim_tmp) { + case 3: + __asm__(VSSSEG6 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG4 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + } + } + else { + if (conja == BLIS_NO_CONJUGATE) { + switch (cdim_tmp) { + case 3: vcmul_vf2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + else { + switch (cdim_tmp) { + case 3: vcmul_vf_conj2(v12, v13, v4, v5, kappa_cast.real, kappa_cast.imag); + case 2: vcmul_vf_conj2(v10, v11, v2, v3, kappa_cast.real, kappa_cast.imag); + case 1: vcmul_vf_conj2(v8, v9, v0, v1, kappa_cast.real, kappa_cast.imag); + } + } + switch (cdim_tmp) { + case 3: + __asm__(VSSSEG6 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 2: + __asm__(VSSSEG4 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + case 1: + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(p_tmp), "r"(2 * FLT_SIZE * ldp)); + break; + } + } + p_tmp += cdim_tmp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR - cdim), "i"(8 * FLT_SIZE)); + for (size_t i = 0; i < vl; ++i) { + __asm__(VSSEG2 "v16, (%0)" : : "r"(p_tmp)); + p_tmp += ldp; + } + a += vl; + p += vl * ldp; + avl -= vl; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v16, (%0)" : : "r"(p)); + p += ldp; + } + } + else { + inca *= 2 * FLT_SIZE; + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v6, 0"); + for (size_t i = 0; i < n; ++i) { + __asm__ volatile("vsetvli zero, %0, e%1, m2, tu, ma" : : "r"(cdim), "i"(8 * FLT_SIZE)); + if (inca == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(a)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(a), "r"(inca)); + } + if (kappa_cast.real == 1. && kappa_cast.imag == 0.) { + if (conja == BLIS_CONJUGATE) { + __asm__("vfneg.v v2, v2"); + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + } + else { + if (conja == BLIS_NO_CONJUGATE) { + vcmul_vf2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); + } + else { + vcmul_vf_conj2(v4, v6, v0, v2, kappa_cast.real, kappa_cast.imag); + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__(VSSEG2 "v4, (%0)" : : "r"(p)); + } + a += lda; + p += ldp; + } + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(NR), "i"(8 * FLT_SIZE)); + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + for (size_t i = n; i < n_max; ++i) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(p)); + p += ldp; + } + } + return; +} diff --git a/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c new file mode 100644 index 0000000000..b9715988d6 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemm_sifive_x280_asm.c @@ -0,0 +1,2405 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "blis.h" +#include "../riscv_cmul_macros_asm.h" +#include +#include +#include +#include + +// byte-size of the floating point type +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLE "vle32.v " +#define VLSE "vlse32.v " +#define VSE "vse32.v " +#define VSSE "vsse32.v " +#define PACKMR 8 +#define PACKNR 64 + +void bli_sgemm_7m4 + ( + dim_t N, + dim_t K, + const float* restrict alpha, + const float* restrict a, + const float* restrict b, + const float* restrict beta, + float* restrict c, inc_t rsc, inc_t csc + ) +{ + // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0 + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + bool first = true; + // compute a*b + for (dim_t k = 0; k < K; ++k) { + __asm__(VLE "v28, (%0)" : : "r"(b)); + if (first) { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmul.vf v0, v28, ft0"); + + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmul.vf v4, v28, ft1"); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmul.vf v8, v28, ft2"); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmul.vf v12, v28, ft3"); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmul.vf v16, v28, ft4"); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmul.vf v20, v28, ft5"); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__("vfmul.vf v24, v28, ft6"); + + first = false; + } + else { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmacc.vf v0, ft0, v28"); + + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmacc.vf v4, ft1, v28"); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmacc.vf v8, ft2, v28"); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmacc.vf v12, ft3, v28"); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * 
FLT_SIZE)); + __asm__("vfmacc.vf v16, ft4, v28"); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmacc.vf v20, ft5, v28"); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__("vfmacc.vf v24, ft6, v28"); + } + + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); + } + + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + + // compute alpha*a*b + beta*c + if (*beta == 0.f) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("vfmul.vf v24, v24, ft10"); + } + else { // beta != 0.f + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + float *c_tmp = c; + if (csc == FLT_SIZE) { // c unit column stride + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v0, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v24, v24, ft10"); + __asm__("vfmacc.vf v24, ft11, v28"); + } // end c unit column stride + else { // c non-unit column stride + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v0, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v24, v24, ft10"); + __asm__("vfmacc.vf v24, ft11, v28"); + } // end c non-unit column stride + } // end beta != 0.f + + // store c + if (csc == 
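+    /* csc == FLT_SIZE means c has unit column stride, so each row of c can
+       be stored contiguously with vse; the else branch below uses the
+       strided vsse form for general csc */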
FLT_SIZE) { + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v24, (%0)" : : "r"(c)); + } + else { + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + } + + return; +} + +void bli_sgemm_7m4_cleanup + ( + dim_t M, + dim_t N, + dim_t K, + const float* restrict alpha, + const float* restrict a, + const float* restrict b, + const float* restrict beta, + float* restrict c, inc_t rsc, inc_t csc + ) +{ + // M x N x K sgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + bool first = true; + // compute a*b + for (dim_t k = 0; k < K; ++k) { + __asm__(VLE "v28, (%0)" : : "r"(b)); + if (first) { + switch (M) { + case 6: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmul.vf v20, v28, ft5"); + case 5: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmul.vf v16, v28, ft4"); + case 4: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmul.vf v12, v28, ft3"); + case 3: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmul.vf v8, v28, ft2"); + case 2: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmul.vf v4, v28, ft1"); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmul.vf v0, v28, ft0"); + } + first = false; + } + else { + switch (M) { + case 6: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmacc.vf v20, ft5, v28"); + case 5: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmacc.vf v16, ft4, v28"); + case 4: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmacc.vf v12, ft3, v28"); + case 3: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmacc.vf v8, ft2, v28"); + case 2: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmacc.vf v4, ft1, v28"); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmacc.vf v0, ft0, v28"); + } + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); + } + + c += (M - 1) * rsc; + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + + // compute alpha*a*b + beta*c + if (*beta == 0.f) { + switch (M) { + case 6: + __asm__("vfmul.vf v20, 
v20, ft10"); + case 5: + __asm__("vfmul.vf v16, v16, ft10"); + case 4: + __asm__("vfmul.vf v12, v12, ft10"); + case 3: + __asm__("vfmul.vf v8, v8, ft10"); + case 2: + __asm__("vfmul.vf v4, v4, ft10"); + case 1: + __asm__("vfmul.vf v0, v0, ft10"); + } + } + else { // beta != 0.f + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + float *c_tmp = c; + if (csc == FLT_SIZE) { + switch (M) { + case 6: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + case 5: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + case 4: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + case 3: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + case 2: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + case 1: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmacc.vf v0, ft11, v28"); + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 6: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + case 5: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + case 4: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + case 3: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + case 2: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + case 1: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmacc.vf v0, ft11, v28"); + } + } // end c non-unit column stride + } // end beta != 0.f + + // store c + if (csc == FLT_SIZE) { + switch (M) { + case 6: + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSE "v0, (%0)" : : "r"(c)); + } + } + else { + switch (M) { + case 6: + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : 
"r"(rsc)); + case 4: + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } + return; +} + +void bli_sgemm_7m4_k0 + ( + dim_t M, + dim_t N, + const float* restrict beta, + float* restrict c, inc_t rsc, inc_t csc + ) +{ + // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 + // This may not produce the same result as the reference kernel if alpha is infinite or NaN. + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + c += (M - 1) * rsc; + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + if (*beta == 0.f) { + // set c to 0 + __asm__("vmv.v.i v0, 0"); + if (csc == FLT_SIZE) { // c unit column stride + switch (M) { + case 7: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSE "v0, (%0)" : : "r"(c)); + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 7: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } // end c non-unit column stride + } // end beta == 0.f + else { // beta != 0.f + __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); + if (csc == FLT_SIZE) { // c unit column stride + switch (M) { + case 7: + __asm__(VLE "v24, (%0)" : : "r"(c)); + __asm__("vfmul.vf v24, v24, ft0"); + __asm__(VSE "v24, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VLE "v20, (%0)" : : "r"(c)); + __asm__("vfmul.vf v20, v20, ft0"); + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VLE "v16, (%0)" : : "r"(c)); + __asm__("vfmul.vf v16, v16, ft0"); + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VLE "v12, (%0)" : : "r"(c)); + __asm__("vfmul.vf v12, v12, ft0"); + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VLE "v8, (%0)" : : "r"(c)); + __asm__("vfmul.vf v8, v8, ft0"); + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VLE "v4, (%0)" : : 
"r"(c)); + __asm__("vfmul.vf v4, v4, ft0"); + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VLE "v0, (%0)" : : "r"(c)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__(VSE "v0, (%0)" : : "r"(c)); + + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 7: + __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v24, v24, ft0"); + __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft0"); + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft0"); + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft0"); + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft0"); + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft0"); + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } // end c non-unit column stride + } // end beta != 0.f + return; +} + +void bli_sgemm_sifive_x280_asm_7m4 + ( + dim_t M, + dim_t N, + dim_t K, + const void* restrict alpha_, + const void* restrict a_, + const void* restrict b_, + const void* restrict beta_, + void* restrict c_, inc_t rsc, inc_t csc, + auxinfo_t* restrict data, + const cntx_t* restrict cntx + ) +{ + (void) data; + (void) cntx; + const float* restrict alpha = alpha_; + const float* restrict beta = beta_; + const float* restrict a = a_; + const float* restrict b = b_; + float* restrict c = c_; + + // M x N x K sgemm + if (M <= 0 || N <= 0 || K < 0) + return; + else if (K == 0) + bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc); + else if (M == 7) + bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); + else + bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE +#undef PACKMR +#undef PACKNR + +// byte-size of the floating point type +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLE "vle64.v " +#define VLSE "vlse64.v " +#define VSE "vse64.v " +#define VSSE "vsse64.v " +#define PACKMR 8 +#define PACKNR 32 + +void bli_dgemm_7m4 + ( + dim_t N, + dim_t K, + const double* restrict alpha, + const double* restrict a, + const double* restrict b, + const double* restrict beta, + double* restrict c, inc_t rsc, inc_t csc + ) +{ + // 7 x N x K dgemm, 0 < N <= 64 = vlmax, K > 0 + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + bool first = true; + // compute a*b + for (dim_t k = 0; k < K; ++k) { + __asm__(VLE "v28, (%0)" : : "r"(b)); + if (first) { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmul.vf v0, v28, ft0"); + + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * 
FLT_SIZE)); + __asm__("vfmul.vf v4, v28, ft1"); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmul.vf v8, v28, ft2"); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmul.vf v12, v28, ft3"); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmul.vf v16, v28, ft4"); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmul.vf v20, v28, ft5"); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__("vfmul.vf v24, v28, ft6"); + + first = false; + } + else { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmacc.vf v0, ft0, v28"); + + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmacc.vf v4, ft1, v28"); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmacc.vf v8, ft2, v28"); + + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmacc.vf v12, ft3, v28"); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmacc.vf v16, ft4, v28"); + + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmacc.vf v20, ft5, v28"); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__("vfmacc.vf v24, ft6, v28"); + } + + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); + } + + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + + // compute alpha*a*b + beta*c + if (*beta == 0.) { + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("vfmul.vf v24, v24, ft10"); + } + else { // beta != 0. 
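+        // Interleave loads of c with the alpha scaling of the accumulators:
+        // each row of c is loaded into v28, the matching accumulator group is
+        // scaled by alpha (ft10), and beta*c (ft11) is folded in with vfmacc
+        // before c_tmp advances to the next row.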
+ __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + double *c_tmp = c; + if (csc == FLT_SIZE) { // c unit column stride + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v0, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v24, v24, ft10"); + __asm__("vfmacc.vf v24, ft11, v28"); + } // end c unit column stride + else { // c non-unit column stride + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v0, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v24, v24, ft10"); + __asm__("vfmacc.vf v24, ft11, v28"); + } // end c non-unit column stride + } // end beta != 0. 
+ + // store c + if (csc == FLT_SIZE) { + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSE "v24, (%0)" : : "r"(c)); + } + else { + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + } + + return; +} + +void bli_dgemm_7m4_cleanup + ( + dim_t M, + dim_t N, + dim_t K, + const double* restrict alpha, + const double* restrict a, + const double* restrict b, + const double* restrict beta, + double* restrict c, inc_t rsc, inc_t csc + ) +{ + // M x N x K dgemm, 0 < M < 6, 0 < N <= 64 = vlmax, K > 0 + __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + bool first = true; + // compute a*b + for (dim_t k = 0; k < K; ++k) { + __asm__(VLE "v28, (%0)" : : "r"(b)); + if (first) { + switch (M) { + case 6: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmul.vf v20, v28, ft5"); + case 5: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmul.vf v16, v28, ft4"); + case 4: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmul.vf v12, v28, ft3"); + case 3: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmul.vf v8, v28, ft2"); + case 2: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmul.vf v4, v28, ft1"); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmul.vf v0, v28, ft0"); + } + first = false; + } + else { + switch (M) { + case 6: + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + __asm__("vfmacc.vf v20, ft5, v28"); + case 5: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__("vfmacc.vf v16, ft4, v28"); + case 4: + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + __asm__("vfmacc.vf v12, ft3, v28"); + case 3: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__("vfmacc.vf v8, ft2, v28"); + case 2: + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + __asm__("vfmacc.vf v4, ft1, v28"); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__("vfmacc.vf v0, ft0, v28"); + } + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); + } + + c += (M - 1) * rsc; + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + + __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); + + // compute alpha*a*b + beta*c + if (*beta == 0.) 
{ + switch (M) { + case 6: + __asm__("vfmul.vf v20, v20, ft10"); + case 5: + __asm__("vfmul.vf v16, v16, ft10"); + case 4: + __asm__("vfmul.vf v12, v12, ft10"); + case 3: + __asm__("vfmul.vf v8, v8, ft10"); + case 2: + __asm__("vfmul.vf v4, v4, ft10"); + case 1: + __asm__("vfmul.vf v0, v0, ft10"); + } + } + else { // beta != 0. + __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); + double *c_tmp = c; + if (csc == FLT_SIZE) { + switch (M) { + case 6: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + case 5: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + case 4: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + case 3: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + case 2: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + case 1: + __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmacc.vf v0, ft11, v28"); + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 6: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v20, ft11, v28"); + case 5: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v16, ft11, v28"); + case 4: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v12, ft11, v28"); + case 3: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v8, ft11, v28"); + case 2: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft10"); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__("vfmacc.vf v4, ft11, v28"); + case 1: + __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft10"); + __asm__("vfmacc.vf v0, ft11, v28"); + } + } // end c non-unit column stride + } // end beta != 0. 
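+    // The switch statements above and below rely on fall-through: entering at
+    // case M processes row M-1 of c and then falls into the cases for the
+    // remaining rows, so a single body covers every 0 < M <= 6.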
+ + // store c + if (csc == FLT_SIZE) { + switch (M) { + case 6: + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSE "v0, (%0)" : : "r"(c)); + } + } + else { + switch (M) { + case 6: + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } + return; +} + +void bli_dgemm_7m4_k0 + ( + dim_t M, + dim_t N, + const double* restrict beta, + double* restrict c, inc_t rsc, inc_t csc + ) +{ + // 0 < M <= 7, 0 < N < 64 = vlmax, K = 0 + // This may not produce the same result as the reference kernel if alpha is infinite or NaN. + __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + c += (M - 1) * rsc; + rsc *= FLT_SIZE; + csc *= FLT_SIZE; + if (*beta == 0.) { + // set c to 0 + __asm__("vmv.v.i v0, 0"); + if (csc == FLT_SIZE) { // c unit column stride + switch (M) { + case 7: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSE "v0, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSE "v0, (%0)" : : "r"(c)); + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 7: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } // end c non-unit column stride + } // end beta == 0. + else { // beta != 0. 
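+        // K == 0 with beta != 0 reduces to scaling c by beta in place:
+        // each row is loaded, multiplied by ft0 (beta), stored back, and
+        // then c steps backward by one row stride.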
+ __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); + if (csc == FLT_SIZE) { // c unit column stride + switch (M) { + case 7: + __asm__(VLE "v24, (%0)" : : "r"(c)); + __asm__("vfmul.vf v24, v24, ft0"); + __asm__(VSE "v24, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VLE "v20, (%0)" : : "r"(c)); + __asm__("vfmul.vf v20, v20, ft0"); + __asm__(VSE "v20, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VLE "v16, (%0)" : : "r"(c)); + __asm__("vfmul.vf v16, v16, ft0"); + __asm__(VSE "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VLE "v12, (%0)" : : "r"(c)); + __asm__("vfmul.vf v12, v12, ft0"); + __asm__(VSE "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VLE "v8, (%0)" : : "r"(c)); + __asm__("vfmul.vf v8, v8, ft0"); + __asm__(VSE "v8, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VLE "v4, (%0)" : : "r"(c)); + __asm__("vfmul.vf v4, v4, ft0"); + __asm__(VSE "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VLE "v0, (%0)" : : "r"(c)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__(VSE "v0, (%0)" : : "r"(c)); + + } + } // end c unit column stride + else { // c non-unit column stride + switch (M) { + case 7: + __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v24, v24, ft0"); + __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 6: + __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v20, v20, ft0"); + __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 5: + __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v16, v16, ft0"); + __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v12, v12, ft0"); + __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v8, v8, ft0"); + __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v4, v4, ft0"); + __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("vfmul.vf v0, v0, ft0"); + __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } // end c non-unit column stride + } // end beta != 0. 
+ return; +} + +void bli_dgemm_sifive_x280_asm_7m4 + ( + dim_t M, + dim_t N, + dim_t K, + const void* restrict alpha_, + const void* restrict a_, + const void* restrict b_, + const void* restrict beta_, + void* restrict c_, inc_t rsc, inc_t csc, + auxinfo_t* restrict data, + const cntx_t* restrict cntx + ) +{ + (void) data; + (void) cntx; + const double* restrict alpha = alpha_; + const double* restrict beta = beta_; + const double* restrict a = a_; + const double* restrict b = b_; + double* restrict c = c_; + + // M x N x K dgemm + if (M <= 0 || N <= 0 || K < 0) + return; + else if (K == 0) + bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc); + else if (M == 7) + bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); + else + bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLE +#undef VLSE +#undef VSE +#undef VSSE +#undef PACKMR +#undef PACKNR + +// byte-size of underlying floating point type +#define FLT_SIZE 4 +#define FLT_LOAD "flw " +#define VLSEG2 "vlseg2e32.v " +#define VLSSEG2 "vlsseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define PACKMR 8 +#define PACKNR 32 + +void bli_cgemm_6m2 + ( + dim_t N, + dim_t K, + const scomplex* restrict alpha, + const scomplex* restrict a, + const scomplex* restrict b, + const scomplex* restrict beta, + scomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // 6 x N x K cgemm, N <= 32 = vlmax, K > 0 + // pairs of register groups hold the real and imag. parts of rows of c and b + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmul_vf(v0, v2, v24, v26, ft0, ft1); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmul_vf(v4, v6, v24, v26, ft2, ft3); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmul_vf(v8, v10, v24, v26, ft4, ft5); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmul_vf(v12, v14, v24, v26, ft6, ft7); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmul_vf(v16, v18, v24, v26, ft8, ft9); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmul_vf(v20, v22, v24, v26, ft10, ft11); + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + while (K > 0) { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v28, v30); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v28, v30); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + 
__asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v28, v30); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v28, v30); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v28, v30); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmacc_vf(v20, v22, ft10, ft11, v28, v30); + K -= 1; + + if (K == 0) { break; } + + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v24, v26); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v24, v26); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v24, v26); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v24, v26); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v24, v26); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmacc_vf(v20, v22, ft10, ft11, v24, v26); + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); + + __asm__("vfmul.vf v24, v2, ft1"); + __asm__("vfmul.vf v26, v0, ft1"); + __asm__("vfmul.vf v28, v6, ft1"); + __asm__("vfmul.vf v30, v4, ft1"); + + __asm__("vfmsub.vf v0, ft0, v24"); + __asm__("vfmadd.vf v2, ft0, v26"); + __asm__("vfmsub.vf v4, ft0, v28"); + __asm__("vfmadd.vf v6, ft0, v30"); + + __asm__("vfmul.vf v24, v10, ft1"); + __asm__("vfmul.vf v26, v8, ft1"); + __asm__("vfmul.vf v28, v14, ft1"); + __asm__("vfmul.vf v30, v12, ft1"); + + __asm__("vfmsub.vf v8, ft0, v24"); + __asm__("vfmadd.vf v10, ft0, v26"); + __asm__("vfmsub.vf v12, ft0, v28"); + __asm__("vfmadd.vf v14, ft0, v30"); + + __asm__("vfmul.vf v24, v18, ft1"); + __asm__("vfmul.vf v26, v16, ft1"); + __asm__("vfmul.vf v28, v22, ft1"); + __asm__("vfmul.vf v30, v20, ft1"); + + __asm__("vfmsub.vf v16, ft0, v24"); + __asm__("vfmadd.vf v18, ft0, v26"); + __asm__("vfmsub.vf v20, ft0, v28"); + __asm__("vfmadd.vf v22, ft0, v30"); + + scomplex beta_cast = *beta; + if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { + if (csc == 2 * FLT_SIZE) { + scomplex *c_tmp = c; + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : 
"+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + + vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); + } + else { + scomplex *c_tmp = c; + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + + vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); + } + } + + if (csc == 2 * FLT_SIZE) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); + } + else { + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); + } + + return; +} + +void bli_cgemm_6m2_cleanup + ( + dim_t M, + dim_t N, + dim_t K, + const scomplex* restrict alpha, + const scomplex* restrict a, + const scomplex* restrict b, + const scomplex* restrict beta, + scomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 + // pairs of register groups hold the real and imag. 
parts of rows of c and b + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmul_vf(v16, v18, v24, v26, ft8, ft9); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmul_vf(v12, v14, v24, v26, ft6, ft7); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmul_vf(v8, v10, v24, v26, ft4, ft5); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmul_vf(v4, v6, v24, v26, ft2, ft3); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmul_vf(v0, v2, v24, v26, ft0, ft1); + } + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + while (K > 0) { + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v28, v30); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v28, v30); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v28, v30); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v28, v30); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v28, v30); + } + K -= 1; + + if (K == 0) { break; } + + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v24, v26); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * 
FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v24, v26); + } + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + c += (M - 1) * rsc; + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); + + switch (M) { + case 5: + __asm__("vfmul.vf v24, v18, ft1"); + __asm__("vfmul.vf v26, v16, ft1"); + __asm__("vfmsub.vf v16, ft0, v24"); + __asm__("vfmadd.vf v18, ft0, v26"); + case 4: + __asm__("vfmul.vf v28, v14, ft1"); + __asm__("vfmul.vf v30, v12, ft1"); + __asm__("vfmsub.vf v12, ft0, v28"); + __asm__("vfmadd.vf v14, ft0, v30"); + case 3: + __asm__("vfmul.vf v24, v10, ft1"); + __asm__("vfmul.vf v26, v8, ft1"); + __asm__("vfmsub.vf v8, ft0, v24"); + __asm__("vfmadd.vf v10, ft0, v26"); + case 2: + __asm__("vfmul.vf v28, v6, ft1"); + __asm__("vfmul.vf v30, v4, ft1"); + __asm__("vfmsub.vf v4, ft0, v28"); + __asm__("vfmadd.vf v6, ft0, v30"); + case 1: + __asm__("vfmul.vf v24, v2, ft1"); + __asm__("vfmul.vf v26, v0, ft1"); + __asm__("vfmsub.vf v0, ft0, v24"); + __asm__("vfmadd.vf v2, ft0, v26"); + } + + scomplex beta_cast = *beta; + if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { + if (csc == 2 * FLT_SIZE) { + scomplex *c_tmp = c; + switch (M) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + case 4: + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + case 2: + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + } + } + else { + scomplex *c_tmp = c; + switch (M) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + case 4: + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + case 2: + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + } + } + } + + if (csc == 2 * FLT_SIZE) { + switch (M) { + case 5: + __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); + __asm__("sub %0, 
%0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + } + } + else { + switch (M) { + case 5: + __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } + + return; +} + +void bli_cgemm_6m2_k0 + ( + dim_t M, + dim_t N, + const scomplex* restrict beta, + scomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 + // This may not produce the same result as the reference kernel if alpha is infinite or NaN. + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + csc *= 2 * FLT_SIZE; + + scomplex beta_cast = *beta; + if (beta_cast.real == 0.f && beta_cast.imag == 0.f) { + // set c to 0 + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + for (size_t i = 0; i < M; ++i) { + if (csc == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + c += rsc; + } + } + else { + // scale c by beta + for (size_t i = 0; i < M; ++i) { + if (csc == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); + vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + } + c += rsc; + } + } + return; +} + +void bli_cgemm_sifive_x280_asm_6m2 + ( + dim_t M, + dim_t N, + dim_t K, + const void* restrict alpha_, + const void* restrict a_, + const void* restrict b_, + const void* restrict beta_, + void* restrict c_, inc_t rsc, inc_t csc, + auxinfo_t* restrict data, + const cntx_t* restrict cntx + ) +{ + // M x N x K cgemm + (void) data; + (void) cntx; + const scomplex* restrict alpha = alpha_; + const scomplex* restrict beta = beta_; + const scomplex* restrict a = a_; + const scomplex* restrict b = b_; + scomplex* restrict c = c_; + + if (M <= 0 || N <= 0 || K < 0) + return; + else if (K == 0) + bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc); + else if (M == 6) + bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); + else + bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef PACKMR +#undef PACKNR + +// byte-size of underlying floating point type +#define FLT_SIZE 8 +#define FLT_LOAD "fld " +#define VLSEG2 "vlseg2e64.v " +#define VLSSEG2 "vlsseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define PACKMR 8 +#define PACKNR 16 + +void bli_zgemm_6m2 + ( + dim_t N, + dim_t K, + const dcomplex* restrict alpha, + const dcomplex* restrict a, + const dcomplex* restrict b, + const dcomplex* restrict beta, + dcomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // 6 x N x K zgemm, N <= 32 = vlmax, K > 0 + // pairs of register groups hold the real and imag. 
parts of rows of c and b + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmul_vf(v0, v2, v24, v26, ft0, ft1); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmul_vf(v4, v6, v24, v26, ft2, ft3); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmul_vf(v8, v10, v24, v26, ft4, ft5); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmul_vf(v12, v14, v24, v26, ft6, ft7); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmul_vf(v16, v18, v24, v26, ft8, ft9); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmul_vf(v20, v22, v24, v26, ft10, ft11); + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + while (K > 0) { + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v28, v30); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v28, v30); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v28, v30); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v28, v30); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v28, v30); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmacc_vf(v20, v22, ft10, ft11, v28, v30); + K -= 1; + + if (K == 0) { break; } + + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v24, v26); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v24, v26); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v24, v26); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * 
FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v24, v26); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v24, v26); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); + vcmacc_vf(v20, v22, ft10, ft11, v24, v26); + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); + + __asm__("vfmul.vf v24, v2, ft1"); + __asm__("vfmul.vf v26, v0, ft1"); + __asm__("vfmul.vf v28, v6, ft1"); + __asm__("vfmul.vf v30, v4, ft1"); + + __asm__("vfmsub.vf v0, ft0, v24"); + __asm__("vfmadd.vf v2, ft0, v26"); + __asm__("vfmsub.vf v4, ft0, v28"); + __asm__("vfmadd.vf v6, ft0, v30"); + + __asm__("vfmul.vf v24, v10, ft1"); + __asm__("vfmul.vf v26, v8, ft1"); + __asm__("vfmul.vf v28, v14, ft1"); + __asm__("vfmul.vf v30, v12, ft1"); + + __asm__("vfmsub.vf v8, ft0, v24"); + __asm__("vfmadd.vf v10, ft0, v26"); + __asm__("vfmsub.vf v12, ft0, v28"); + __asm__("vfmadd.vf v14, ft0, v30"); + + __asm__("vfmul.vf v24, v18, ft1"); + __asm__("vfmul.vf v26, v16, ft1"); + __asm__("vfmul.vf v28, v22, ft1"); + __asm__("vfmul.vf v30, v20, ft1"); + + __asm__("vfmsub.vf v16, ft0, v24"); + __asm__("vfmadd.vf v18, ft0, v26"); + __asm__("vfmsub.vf v20, ft0, v28"); + __asm__("vfmadd.vf v22, ft0, v30"); + + dcomplex beta_cast = *beta; + if (beta_cast.real != 0. || beta_cast.imag != 0.) 
{ + if (csc == 2 * FLT_SIZE) { + dcomplex *c_tmp = c; + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + + vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); + } + else { + dcomplex *c_tmp = c; + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + + vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); + } + } + + if (csc == 2 * FLT_SIZE) { + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); + } + else { + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); + __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); + } + + return; +} + +void bli_zgemm_6m2_cleanup + ( + dim_t M, + dim_t N, + dim_t K, + const dcomplex* restrict alpha, + const dcomplex* restrict a, + const dcomplex* restrict b, + const dcomplex* restrict beta, + dcomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 + // pairs of 
register groups hold the real and imag. parts of rows of c and b + + __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmul_vf(v16, v18, v24, v26, ft8, ft9); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmul_vf(v12, v14, v24, v26, ft6, ft7); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmul_vf(v8, v10, v24, v26, ft4, ft5); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmul_vf(v4, v6, v24, v26, ft2, ft3); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmul_vf(v0, v2, v24, v26, ft0, ft1); + } + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + while (K > 0) { + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v28, v30); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v28, v30); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v28, v30); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v28, v30); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v28, v30); + } + K -= 1; + + if (K == 0) { break; } + + if (K >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + + switch (M) { + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); + vcmacc_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); + vcmacc_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); + vcmacc_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); + vcmacc_vf(v4, v6, ft2, ft3, v24, v26); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD 
"ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); + vcmacc_vf(v0, v2, ft0, ft1, v24, v26); + } + K -= 1; + + if (K >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); + __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + c += (M - 1) * rsc; + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); + + switch (M) { + case 5: + __asm__("vfmul.vf v24, v18, ft1"); + __asm__("vfmul.vf v26, v16, ft1"); + __asm__("vfmsub.vf v16, ft0, v24"); + __asm__("vfmadd.vf v18, ft0, v26"); + case 4: + __asm__("vfmul.vf v28, v14, ft1"); + __asm__("vfmul.vf v30, v12, ft1"); + __asm__("vfmsub.vf v12, ft0, v28"); + __asm__("vfmadd.vf v14, ft0, v30"); + case 3: + __asm__("vfmul.vf v24, v10, ft1"); + __asm__("vfmul.vf v26, v8, ft1"); + __asm__("vfmsub.vf v8, ft0, v24"); + __asm__("vfmadd.vf v10, ft0, v26"); + case 2: + __asm__("vfmul.vf v28, v6, ft1"); + __asm__("vfmul.vf v30, v4, ft1"); + __asm__("vfmsub.vf v4, ft0, v28"); + __asm__("vfmadd.vf v6, ft0, v30"); + case 1: + __asm__("vfmul.vf v24, v2, ft1"); + __asm__("vfmul.vf v26, v0, ft1"); + __asm__("vfmsub.vf v0, ft0, v24"); + __asm__("vfmadd.vf v2, ft0, v26"); + } + + dcomplex beta_cast = *beta; + if (beta_cast.real != 0. || beta_cast.imag != 0.) { + if (csc == 2 * FLT_SIZE) { + dcomplex *c_tmp = c; + switch (M) { + case 5: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + case 4: + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + case 3: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + case 2: + __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + case 1: + __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + } + } + else { + dcomplex *c_tmp = c; + switch (M) { + case 5: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); + case 4: + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); + case 3: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); + case 2: + __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); + vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); + case 1: + __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); + vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); + } + } + } + + if (csc == 2 * FLT_SIZE) { + switch (M) { + case 5: + __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSEG2 "v8, (%0)" : 
: "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + } + } + else { + switch (M) { + case 5: + __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 4: + __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 3: + __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 2: + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); + case 1: + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + } + } + + return; +} + +void bli_zgemm_6m2_k0 + ( + dim_t M, + dim_t N, + const dcomplex* restrict beta, + dcomplex* restrict c, inc_t rsc, inc_t csc + ) +{ + // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 + // This may not produce the same result as the reference kernel if alpha is infinite or NaN. + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); + csc *= 2 * FLT_SIZE; + + dcomplex beta_cast = *beta; + if (beta_cast.real == 0. && beta_cast.imag == 0.) { + // set c to 0 + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + for (size_t i = 0; i < M; ++i) { + if (csc == 2 * FLT_SIZE) + __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); + else + __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + c += rsc; + } + } + else { + // scale c by beta + for (size_t i = 0; i < M; ++i) { + if (csc == 2 * FLT_SIZE) { + __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); + vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); + __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); + } + else { + __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); + vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); + __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); + } + c += rsc; + } + } + return; +} + +void bli_zgemm_sifive_x280_asm_6m2 + ( + dim_t M, + dim_t N, + dim_t K, + const void* restrict alpha_, + const void* restrict a_, + const void* restrict b_, + const void* restrict beta_, + void* restrict c_, inc_t rsc, inc_t csc, + auxinfo_t* restrict data, + const cntx_t* restrict cntx + ) +{ + // M x N x K zgemm + (void) data; + (void) cntx; + const dcomplex* restrict alpha = alpha_; + const dcomplex* restrict beta = beta_; + const dcomplex* restrict a = a_; + const dcomplex* restrict b = b_; + dcomplex* restrict c = c_; + + if (M <= 0 || N <= 0 || K < 0) + return; + else if (K == 0) + bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc); + else if (M == 6) + bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); + else + bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); + return; +} + +#undef FLT_SIZE +#undef FLT_LOAD +#undef VLSEG2 +#undef VLSSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef PACKMR +#undef PACKNR diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c new file mode 100644 index 0000000000..18df010d05 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_complex.c @@ -0,0 +1,327 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + (void) data; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + const DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + if (m <= 0 || n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + + DATATYPE alpha_cast = *alpha; + if (alpha_cast.real == 0 && alpha_cast.imag == 0) { + switch (m) { + case 6: + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v22, 0"); + case 5: + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v18, 0"); + case 4: + __asm__("vmv.v.i v12, 0"); + __asm__("vmv.v.i v14, 0"); + case 3: + __asm__("vmv.v.i v8, 0"); + __asm__("vmv.v.i v10, 0"); + case 2: + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v6, 0"); + case 1: + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + } + } + else { + const DATATYPE* b11_tmp = b11 + (m - 1) * PACKNR; + switch (m) { + case 6: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); + case 5: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); + case 3: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(-PACKNR * 2 * FLT_SIZE)); + 
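+            // Note (sketch, not part of the macro definition): vcmul_vf2 from
+            // riscv_cmul_macros_asm.h is assumed to perform a split-format
+            // complex multiply. With a row of b11 loaded as separate real/imag
+            // vectors (v24/v26 or v28/v30), each element is expected to become
+            //   out_r = b_r * alpha.real - b_i * alpha.imag
+            //   out_i = b_r * alpha.imag + b_i * alpha.real
+            // i.e. the scalar equivalent of  out[j] = alpha * b11[row][j].
+            // Rows of b11 are visited last-to-first, with b11_tmp stepping back
+            // one packed row (PACKNR complex elements) per executed case.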
case 1: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); + } + } + + if (k >= 1) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); + __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); + } + if (k >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); + __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + while (k > 0) { + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); + vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); + } + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b01)); + __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a10), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a10), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a10), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a10), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a10), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a10), "I"(7 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a10), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a10), "I"(5 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a10), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a10), "I"(3 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a10), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a10), "I"(1 * FLT_SIZE)); + vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); + } + k -= 1; + + if (k >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b01)); + __asm__("addi %0, %0, %1" : "+r"(b01) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a10) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); + vcmul_vf(v24, v26, v0, v2, ft0, ft1); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 1) return; + + switch (m) { 
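+    // Lower-triangular solve by forward substitution. Row 0 of the solution was
+    // just formed in v24/v26 by scaling b11 row 0 with the stored a11[0] entry;
+    // this assumes the packed a11 carries a pre-inverted diagonal (BLIS's usual
+    // trsm convention), so a multiply stands in for the divide. The cases below
+    // then eliminate that row from the rows still to be solved; a scalar sketch
+    // of one elimination step, with a11[i] denoting the packed a11(i,0) entry:
+    //   b11[i][j] -= a11[i] * x0[j];   // complex, done via vcnmsac_vf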
+ case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); + __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(3 * FLT_SIZE)); + vcmul_vf(v24, v26, v4, v6, ft2, ft3); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 2) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); + __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(5 * FLT_SIZE)); + vcmul_vf(v24, v26, v8, v10, ft4, ft5); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 3) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); + __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), 
"I"(6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(7 * FLT_SIZE)); + vcmul_vf(v24, v26, v12, v14, ft6, ft7); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 4) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); + __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(9 * FLT_SIZE)); + vcmul_vf(v24, v26, v16, v18, ft8, ft9); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 5) return; + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(PACKNR * 2 * FLT_SIZE)); + __asm__("add %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(11 * FLT_SIZE)); + vcmul_vf(v24, v26, v20, v22, ft10, ft11); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + return; +} + +#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c new file mode 100644 index 0000000000..a0f9134731 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_l_sifive_x280_asm_real.c @@ -0,0 +1,253 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#ifdef GEMMTRSM + +GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a10 = a10_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b01 = b01_; + const DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) + return; + + dim_t b11_offset, temp; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); + + // Multiply step sizes by data size + __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); + __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); + + __asm__("addi %0, %1, %2": "=r"(b11_offset): "r"(m), "I"(-1)); + __asm__("li %0, %1": "=r"(temp): "I"(PACKNR * FLT_SIZE)); + __asm__("mul %0, %0, %1": "+r"(b11_offset): "r"(temp)); + // b11_offset = (m-1)*PACKNR*FLT_SIZE + + __asm__("add %0, %0, %1": "+r"(b11): "r"(b11_offset)); + __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); // TO DO: optimize alpha = 1 case + switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha + case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); + __asm__("vfmul.vf v4, v4, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); + __asm__("vfmul.vf v8, v8, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); + __asm__("vfmul.vf v12, v12, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); + __asm__("vfmul.vf v16, v16, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); + __asm__("vfmul.vf v20, v20, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); + __asm__("vfmul.vf v24, v24, f0"); + // no sub of b11 on final entry + } + // b11 now reset to original value + // v0 = row 6 of b11 + // v4 = row 5 of b11 + // v8 = row 4 of b11 + // v12 = row 3 of b11 + // v16 = row 2 of b11 + // v20 = row 1 of b11 + // v24 = row 0 of b11 + + // GEMM: B11 := alpha * B11 - A10 * B01 + for (dim_t i = 0; i < k; i++){ + __asm__(VLE " v28, (%0)": : "r"(b01)); // kth row of b01 + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(6*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v0, f6, v28"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(5*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v4, f5, v28"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(4*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v8, f4, v28"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(3*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v12, f3, v28"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(2*FLT_SIZE), "r"(a10)); + 
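+            // vfnmsac.vf vd, f, vs computes vd[j] -= f * vs[j]. With v28 holding
+            // the kth row of b01 and f0..f6 holding column k of the packed a10
+            // panel, each executed case performs one rank-1 update of
+            //   B11 := alpha * B11 - A10 * B01
+            // that is, b11[i][j] -= a10[i] * b01[k][j] for row i of the microtile.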
__asm__("vfnmsac.vf v16, f2, v28"); + case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(1*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v20, f1, v28"); + case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(0*FLT_SIZE), "r"(a10)); + __asm__("vfnmsac.vf v24, f0, v28"); + } + __asm__("addi %0, %0, %1": "+r"(a10): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b01): "I"(PACKNR * FLT_SIZE)); + } + // TRSM: B11 := inv(A11) * B11 + // TO DO: Investigate code size reduction (loop rerolling) + + // Row 0 + __asm__(FLT_LOAD " f0, %0(%1)": : "I"(0*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v24, v24, f0"); + __asm__(VSE " v24, (%0)": : "r"(b11)); + __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 1) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v24"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v24"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v24"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v24"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v16, f2, v24"); + case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v20, f1, v24"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 1 + __asm__(FLT_LOAD " f1, %0(%1)": : "I"(1*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v20, v20, f1"); + __asm__(VSE " v20, (%0)": : "r"(b11)); + __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 2) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v20"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v20"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v20"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v20"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v16, f2, v20"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 2 + __asm__(FLT_LOAD " f2, %0(%1)": : "I"(2*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v16, v16, f2"); + __asm__(VSE " v16, (%0)": : "r"(b11)); + __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 3) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v16"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v16"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v16"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v16"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 3 + __asm__(FLT_LOAD " f3, %0(%1)": : "I"(3*FLT_SIZE), 
"r"(a11)); + __asm__("vfmul.vf v12, v12, f3"); + __asm__(VSE " v12, (%0)": : "r"(b11)); + __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 4) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v12"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v12"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v12"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 4 + __asm__(FLT_LOAD " f4, %0(%1)": : "I"(4*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v8, v8, f4"); + __asm__(VSE " v8, (%0)": : "r"(b11)); + __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 5) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v8"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v8"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 5 + __asm__(FLT_LOAD " f5, %0(%1)": : "I"(5*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v4, v4, f5"); + __asm__(VSE " v4, (%0)": : "r"(b11)); + __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 6) return; + + __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v4"); + + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 6 + __asm__(FLT_LOAD " f6, %0(%1)": : "I"(6*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v0, v0, f6"); + __asm__(VSE " v0, (%0)": : "r"(b11)); + __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); +} +#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c new file mode 100644 index 0000000000..4323f8fbf6 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_sifive_x280_asm.c @@ -0,0 +1,182 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// clang-format off +#include "blis.h" +#include "../../riscv_cmul_macros_asm.h" +#include +#include + +#define GEMMTRSM_L(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_l_sifive_x280_asm(\ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const T* restrict alpha_, \ + const T* restrict a10_, \ + const T* restrict a11_, \ + const T* restrict b01_, \ + T* restrict b11_, \ + T* restrict c11_, \ + inc_t rsc, \ + inc_t csc, \ + auxinfo_t* restrict data, \ + const cntx_t* restrict cntx \ + ) + +#define GEMMTRSM_U(PRECISION_CHAR, T) void bli_##PRECISION_CHAR##gemmtrsm_u_sifive_x280_asm(\ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const T* restrict alpha_, \ + const T* restrict a12_, \ + const T* restrict a11_, \ + const T* restrict b21_, \ + T* restrict b11_, \ + T* restrict c11_, \ + inc_t rsc, \ + inc_t csc, \ + auxinfo_t* restrict data, \ + const cntx_t* restrict cntx \ + ) + +#define GEMMTRSM(macro, ...) macro(__VA_ARGS__) + +// Single precision real +#define DATATYPE float +#define PRECISION_CHAR s +#define PACKMR 8 +#define PACKNR 64 +#define VLE "vle32.v" +#define VSE "vse32.v" +#define VSSE "vsse32.v" +#define FLT_LOAD "flw" +#define FLT_SIZE sizeof(float) +#define LOG_FLT_SIZE 2 + + +#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PACKMR +#undef PACKNR +#undef VLE +#undef VSE +#undef VSSE +#undef FLT_LOAD +#undef FLT_SIZE +#undef LOG_FLT_SIZE + +// Double precision real +#define DATATYPE double +#define PRECISION_CHAR d +#define PACKMR 8 +#define PACKNR 32 +#define VLE "vle64.v" +#define VSE "vse64.v" +#define VSSE "vsse64.v" +#define FLT_LOAD "fld" +#define FLT_SIZE sizeof(double) +#define LOG_FLT_SIZE 3 + +#include "./bli_gemmtrsm_l_sifive_x280_asm_real.c" +#include "./bli_gemmtrsm_u_sifive_x280_asm_real.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PACKMR +#undef PACKNR +#undef VLE +#undef VSE +#undef VSSE +#undef FLT_LOAD +#undef FLT_SIZE +#undef LOG_FLT_SIZE + +// Single precision complex +#define DATATYPE scomplex +#define PRECISION_CHAR c +#define PACKMR 8 +#define PACKNR 32 +#define VLSEG2 "vlseg2e32.v " +#define VSSEG2 "vsseg2e32.v " +#define VSSSEG2 "vssseg2e32.v " +#define FLT_LOAD "flw " +#define FLT_SIZE sizeof(float) + +#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PACKMR +#undef PACKNR +#undef VLSEG2 +#undef VSSEG2 +#undef VSSSEG2 +#undef FLT_LOAD +#undef FLT_SIZE + +// Double precision complex +#define DATATYPE dcomplex +#define PRECISION_CHAR z +#define PACKMR 8 +#define PACKNR 16 +#define VLSEG2 "vlseg2e64.v " +#define VSSEG2 "vsseg2e64.v " +#define VSSSEG2 "vssseg2e64.v " +#define FLT_LOAD "fld " +#define FLT_SIZE sizeof(double) + +#include "./bli_gemmtrsm_l_sifive_x280_asm_complex.c" +#include "./bli_gemmtrsm_u_sifive_x280_asm_complex.c" + +#undef DATATYPE +#undef PRECISION_CHAR +#undef PACKMR +#undef PACKNR +#undef VLSEG 
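+// The z (dcomplex) block above follows the same pattern as the s, d and c
+// blocks: the GEMMTRSM(macro, ...) indirection forces PRECISION_CHAR to be
+// expanded before the ## paste inside GEMMTRSM_L/GEMMTRSM_U, so each included
+// generic body defines the precision-specific kernel, e.g. for this block
+//   GEMMTRSM(GEMMTRSM_L, PRECISION_CHAR, void)
+//     -> GEMMTRSM_L(z, void)
+//     -> void bli_zgemmtrsm_l_sifive_x280_asm(...)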
+#undef VSSEG +#undef VSSSEG +#undef FLT_LOAD +#undef FLT_SIZE + + + +#undef GEMMTRSM +#undef GEMMTRSM_L +#undef GEMMTRSM_U + + diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c new file mode 100644 index 0000000000..9332fd0963 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_complex.c @@ -0,0 +1,331 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef GEMMTRSM + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + (void) data; + (void) cntx; + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + const DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + if (m <= 0 || n <= 0) + return; + + __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(n), "i"(8 * FLT_SIZE)); + + DATATYPE alpha_cast = *alpha; + if (alpha_cast.real == 0 && alpha_cast.imag == 0) { + switch (m) { + case 6: + __asm__("vmv.v.i v20, 0"); + __asm__("vmv.v.i v22, 0"); + case 5: + __asm__("vmv.v.i v16, 0"); + __asm__("vmv.v.i v18, 0"); + case 4: + __asm__("vmv.v.i v12, 0"); + __asm__("vmv.v.i v14, 0"); + case 3: + __asm__("vmv.v.i v8, 0"); + __asm__("vmv.v.i v10, 0"); + case 2: + __asm__("vmv.v.i v4, 0"); + __asm__("vmv.v.i v6, 0"); + case 1: + __asm__("vmv.v.i v0, 0"); + __asm__("vmv.v.i v2, 0"); + } + } + else { + const DATATYPE* b11_tmp = b11; + switch (m) { + case 6: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v20, v22, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); + case 5: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v16, v18, v28, v30, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); + case 4: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v12, v14, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); + case 3: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v8, v10, v28, v30, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); + case 2: + __asm__(VLSEG2 "v24, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v4, v6, v24, v26, alpha_cast.real, alpha_cast.imag); + __asm__("addi %0, %0, %1" : "+r"(b11_tmp) : "I"(PACKNR * 2 * FLT_SIZE)); + case 1: + __asm__(VLSEG2 "v28, (%0)" : : "r"(b11_tmp)); + vcmul_vf2(v0, v2, v28, v30, alpha_cast.real, alpha_cast.imag); + } + } + + if (k >= 1) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); + __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); + } + if (k >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); + __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); + } + + a12 += m - 1; + + while (k > 0) { + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD 
"ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); + vcnmsac_vf(v0, v2, ft0, ft1, v24, v26); + } + k -= 1; + + if (k == 0) { break; } + + if (k >= 2) { + __asm__(VLSEG2 "v24, (%0)" : : "r"(b21)); + __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a12), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a12), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v28, v30); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a12), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a12), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v28, v30); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a12), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a12), "I"(-5 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v28, v30); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a12), "I"(-4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a12), "I"(-3 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v28, v30); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a12), "I"(-2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a12), "I"(-1 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v28, v30); + case 1: + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a12), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a12), "I"(1 * FLT_SIZE)); + vcnmsac_vf(v0, v2, ft0, ft1, v28, v30); + } + k -= 1; + + if (k >= 2) { + __asm__(VLSEG2 "v28, (%0)" : : "r"(b21)); + __asm__("addi %0, %0, %1" : "+r"(b21) : "I"(PACKNR * 2 * FLT_SIZE)); + } + __asm__("addi %0, %0, %1" : "+r"(a12) : "I"(PACKMR * 2 * FLT_SIZE)); + } + + a11 += (m - 1) * (PACKMR + 1); // (m - 1) + (m - 1) * PACKMR + b11 += (m - 1) * PACKNR; + c11 += (m - 1) * rsc; + rsc *= 2 * FLT_SIZE; + csc *= 2 * FLT_SIZE; + + __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a11), "I"(0 * FLT_SIZE)); + __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a11), "I"(1 * FLT_SIZE)); + vcmul_vf(v24, v26, v0, v2, ft0, ft1); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 1) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + case 2: + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); + vcnmsac_vf(v4, v6, ft2, ft3, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); + __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a11), "I"(-2 * FLT_SIZE)); + __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a11), "I"(-1 * FLT_SIZE)); + vcmul_vf(v24, v26, v4, v6, ft2, ft3); + __asm__(VSSEG2 "v24, 
(%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 2) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + case 3: + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); + vcnmsac_vf(v8, v10, ft4, ft5, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); + __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a11), "I"(-4 * FLT_SIZE)); + __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a11), "I"(-3 * FLT_SIZE)); + vcmul_vf(v24, v26, v8, v10, ft4, ft5); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 3) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + case 4: + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); + vcnmsac_vf(v12, v14, ft6, ft7, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); + __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a11), "I"(-6 * FLT_SIZE)); + __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a11), "I"(-5 * FLT_SIZE)); + vcmul_vf(v24, v26, v12, v14, ft6, ft7); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 4) return; + + switch (m) { + case 6: + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + case 5: + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); + vcnmsac_vf(v16, v18, ft8, ft9, v24, v26); + } + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); + __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a11), "I"(-8 * FLT_SIZE)); + __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a11), "I"(-7 * FLT_SIZE)); + vcmul_vf(v24, v26, v16, v18, ft8, ft9); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + if (m == 5) return; + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + 
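+    // vcnmsac_vf (riscv_cmul_macros_asm.h) is assumed to be a split-format
+    // complex negated multiply-accumulate: with ft10/ft11 holding the real/imag
+    // parts of an a11 entry and v24/v26 the row just solved, the destination
+    // row pair is updated roughly as
+    //   acc_r -= a_r * x_r - a_i * x_i
+    //   acc_i -= a_r * x_i + a_i * x_r
+    // which is the back-substitution update b11[i,:] -= a11[i][j] * x[j,:]
+    // applied to the rows above the one just stored.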
vcnmsac_vf(v20, v22, ft10, ft11, v24, v26); + + __asm__("addi %0, %0, %1" : "+r"(a11) : "I"(-PACKMR * 2 * FLT_SIZE)); + __asm__("addi %0, %0, %1" : "+r"(b11) : "I"(-PACKNR * 2 * FLT_SIZE)); + __asm__("sub %0, %0, %1" : "+r"(c11) : "r"(rsc)); + + __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a11), "I"(-10 * FLT_SIZE)); + __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a11), "I"(-9 * FLT_SIZE)); + vcmul_vf(v24, v26, v20, v22, ft10, ft11); + __asm__(VSSEG2 "v24, (%0)" : : "r"(b11)); + __asm__(VSSSEG2 "v24, (%0), %1" : : "r"(c11), "r"(csc)); + + return; +} +#endif diff --git a/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c new file mode 100644 index 0000000000..2d511a8ba6 --- /dev/null +++ b/kernels/sifive_x280/3/bli_gemmtrsm_sifive_x280_asm/bli_gemmtrsm_u_sifive_x280_asm_real.c @@ -0,0 +1,260 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +// clang-format off +#ifdef GEMMTRSM + +GEMMTRSM(GEMMTRSM_U, PRECISION_CHAR, void) +{ + const DATATYPE* restrict alpha = alpha_; + const DATATYPE* restrict a12 = a12_; + const DATATYPE* restrict a11 = a11_; + const DATATYPE* restrict b21 = b21_; + const DATATYPE* restrict b11 = b11_; + DATATYPE* restrict c11 = c11_; + + if (!(1 <= m && m <= PACKMR && 1 <= n && n <= PACKNR)) + return; + + dim_t m_sz, a11_offset, c11_offset, temp; + size_t vl; + __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma": "=r"(vl) : "r"(n), "i"(8*FLT_SIZE)); + + // Multiply step sizes by data size + __asm__("slli %0, %0, %1": "+r"(rsc) : "I"(LOG_FLT_SIZE)); + __asm__("slli %0, %0, %1": "+r"(csc) : "I"(LOG_FLT_SIZE)); + __asm__("slli %0, %1, %2": "=r"(m_sz) : "r"(m), "I"(LOG_FLT_SIZE)); + + __asm__("li %0, %1": "=r"(temp): "I"((PACKMR+1)*FLT_SIZE)); + __asm__("mul %0, %1, %2": "=r"(a11_offset) : "r"(m), "r"(temp)); + __asm__("addi %0, %0, %1": "+r"(a11_offset) : "I"(-PACKMR * FLT_SIZE)); + __asm__("mul %0, %1, %2": "=r"(c11_offset) : "r"(m), "r"(rsc)); + __asm__("sub %0, %0, %1": "+r"(c11_offset) : "r"(rsc)); + // a11_offset = (PACKMR*(m-1)+m)*sz = m*(PACKMR+1)*FLT_SIZE - PACKMR*FLT_SIZE + // c11_offset = rsc*(m-1)*sz + + __asm__(FLT_LOAD " f0, (%0)" : : "r"(alpha)); + switch (m){ // Vector loads from b11 with Duff device, multiplying by alpha + case 7: __asm__(VLE " v0, (%0)": : "r"(b11)); + __asm__("vfmul.vf v0, v0, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 6: __asm__(VLE " v4, (%0)": : "r"(b11)); + __asm__("vfmul.vf v4, v4, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 5: __asm__(VLE " v8, (%0)": : "r"(b11)); + __asm__("vfmul.vf v8, v8, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 4: __asm__(VLE " v12, (%0)": : "r"(b11)); + __asm__("vfmul.vf v12, v12, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 3: __asm__(VLE " v16, (%0)": : "r"(b11)); + __asm__("vfmul.vf v16, v16, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 2: __asm__(VLE " v20, (%0)": : "r"(b11)); + __asm__("vfmul.vf v20, v20, f0"); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(PACKNR * FLT_SIZE)); + case 1: __asm__(VLE " v24, (%0)": : "r"(b11)); + __asm__("vfmul.vf v24, v24, f0"); + // no add of b11 on final entry + } + // b11 now positioned at start of last row + // v24 = row 0 from bottom (bottom row) + // v20 = row 1 from bottom + // v16 = row 2 from bottom + // v12 = row 3 from bottom + // v8 = row 4 from bottom + // v4 = row 5 from bottom + // v0 = row 6 from bottom + + // GEMM: B11 := alpha * B11 - A12 * B21 + __asm__("add %0, %0, %1": "+r"(a12): "r"(m_sz)); + for (dim_t i = 0; i < k; i++){ + __asm__(VLE " v28, (%0)": : "r"(b21)); // kth row of b21 + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)" : : "I"(-7*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v0, f6, v28"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)" : : "I"(-6*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v4, f5, v28"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)" : : "I"(-5*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v8, f4, v28"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)" : : "I"(-4*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v12, f3, v28"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)" : : "I"(-3*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v16, f2, v28"); + case 2: __asm__(FLT_LOAD " f1, %0(%1)" : : "I"(-2*FLT_SIZE), "r"(a12)); + __asm__("vfnmsac.vf v20, f1, v28"); + case 1: __asm__(FLT_LOAD " f0, %0(%1)" : : "I"(-1*FLT_SIZE), 
"r"(a12)); + __asm__("vfnmsac.vf v24, f0, v28"); + } + __asm__("addi %0, %0, %1": "+r"(a12): "I"(PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b21): "I"(PACKNR * FLT_SIZE)); + } + // TRSM: B11 := inv(A11) * B11 + // Move a11 to end of array and c11 to first entry in last row + __asm__("add %0, %0, %1": "+r"(a11): "r"(a11_offset)); + __asm__("add %0, %0, %1": "+r"(c11): "r"(c11_offset)); + + // Row 0 from bottom (bottom row) + __asm__(FLT_LOAD " f0, %0(%1)": : "I"(-1*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v24, v24, f0"); + __asm__(VSE " v24, (%0)": : "r"(b11)); + __asm__(VSSE " v24, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 1) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v24"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v24"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v24"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v24"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v16, f2, v24"); + case 2: __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v20, f1, v24"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 1 from bottom + __asm__(FLT_LOAD " f1, %0(%1)": : "I"(-2*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v20, v20, f1"); + __asm__(VSE " v20, (%0)": : "r"(b11)); + __asm__(VSSE " v20, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 2) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v20"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v20"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v20"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v20"); + case 3: __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v16, f2, v20"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 2 from bottom + __asm__(FLT_LOAD " f2, %0(%1)": : "I"(-3*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v16, v16, f2"); + __asm__(VSE " v16, (%0)": : "r"(b11)); + __asm__(VSSE " v16, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 3) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v16"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v16"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v16"); + case 4: __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v12, f3, v16"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 3 from bottom + __asm__(FLT_LOAD " f3, %0(%1)": : "I"(-4*FLT_SIZE), 
"r"(a11)); + __asm__("vfmul.vf v12, v12, f3"); + __asm__(VSE " v12, (%0)": : "r"(b11)); + __asm__(VSSE " v12, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 4) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v12"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v12"); + case 5: __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v8, f4, v12"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 4 from bottom + __asm__(FLT_LOAD " f4, %0(%1)": : "I"(-5*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v8, v8, f4"); + __asm__(VSE " v8, (%0)": : "r"(b11)); + __asm__(VSSE " v8, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 5) return; + + switch (m){ + case 7: __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v8"); + case 6: __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v4, f5, v8"); + } + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 5 from bottom + __asm__(FLT_LOAD " f5, %0(%1)": : "I"(-6*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v4, v4, f5"); + __asm__(VSE " v4, (%0)": : "r"(b11)); + __asm__(VSSE " v4, (%0), %1": : "r"(c11), "r"(csc)); + if (m == 6) return; + + __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfnmsac.vf v0, f6, v4"); + + // Pointer bumps + __asm__("addi %0, %0, %1": "+r"(a11): "I"(-PACKMR * FLT_SIZE)); + __asm__("addi %0, %0, %1": "+r"(b11): "I"(-PACKNR * FLT_SIZE)); + __asm__("sub %0, %0, %1": "+r"(c11): "r"(rsc)); + + // Row 6 from bottom + __asm__(FLT_LOAD " f6, %0(%1)": : "I"(-7*FLT_SIZE), "r"(a11)); + __asm__("vfmul.vf v0, v0, f6"); + __asm__(VSE " v0, (%0)": : "r"(b11)); + __asm__(VSSE " v0, (%0), %1": : "r"(c11), "r"(csc)); + +} +#endif diff --git a/kernels/sifive_x280/bli_kernels_sifive_x280.h b/kernels/sifive_x280/bli_kernels_sifive_x280.h new file mode 100644 index 0000000000..425c7dad92 --- /dev/null +++ b/kernels/sifive_x280/bli_kernels_sifive_x280.h @@ -0,0 +1,160 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// Level 1 +ADDV_KER_PROT(float, s, addv_sifive_x280_intr) +ADDV_KER_PROT(double, d, addv_sifive_x280_intr) +ADDV_KER_PROT(scomplex, c, addv_sifive_x280_intr) +ADDV_KER_PROT(dcomplex, z, addv_sifive_x280_intr) + +AMAXV_KER_PROT(float, s, amaxv_sifive_x280_asm) +AMAXV_KER_PROT(double, d, amaxv_sifive_x280_asm) +AMAXV_KER_PROT(scomplex, c, amaxv_sifive_x280_asm) +AMAXV_KER_PROT(dcomplex, z, amaxv_sifive_x280_asm) + +AXPBYV_KER_PROT(float, s, axpbyv_sifive_x280_intr) +AXPBYV_KER_PROT(double, d, axpbyv_sifive_x280_intr) +AXPBYV_KER_PROT(scomplex, c, axpbyv_sifive_x280_intr) +AXPBYV_KER_PROT(dcomplex, z, axpbyv_sifive_x280_intr) + +AXPYV_KER_PROT(float, s, axpyv_sifive_x280_intr) +AXPYV_KER_PROT(double, d, axpyv_sifive_x280_intr) +AXPYV_KER_PROT(scomplex, c, axpyv_sifive_x280_intr) +AXPYV_KER_PROT(dcomplex, z, axpyv_sifive_x280_intr) + +COPYV_KER_PROT(float, s, copyv_sifive_x280_asm) +COPYV_KER_PROT(double, d, copyv_sifive_x280_asm) +COPYV_KER_PROT(scomplex, c, copyv_sifive_x280_asm) +COPYV_KER_PROT(dcomplex, z, copyv_sifive_x280_asm) + +DOTV_KER_PROT(float, s, dotv_sifive_x280_intr) +DOTV_KER_PROT(double, d, dotv_sifive_x280_intr) +DOTV_KER_PROT(scomplex, c, dotv_sifive_x280_intr) +DOTV_KER_PROT(dcomplex, z, dotv_sifive_x280_intr) + +DOTXV_KER_PROT(float, s, dotxv_sifive_x280_intr) +DOTXV_KER_PROT(double, d, dotxv_sifive_x280_intr) +DOTXV_KER_PROT(scomplex, c, dotxv_sifive_x280_intr) +DOTXV_KER_PROT(dcomplex, z, dotxv_sifive_x280_intr) + +INVERTV_KER_PROT(float, s, invertv_sifive_x280_asm) +INVERTV_KER_PROT(double, d, invertv_sifive_x280_asm) +INVERTV_KER_PROT(scomplex, c, invertv_sifive_x280_asm) +INVERTV_KER_PROT(dcomplex, z, invertv_sifive_x280_asm) + +INVSCALV_KER_PROT(float, s, invscalv_sifive_x280_asm) +INVSCALV_KER_PROT(double, d, invscalv_sifive_x280_asm) +INVSCALV_KER_PROT(scomplex, c, invscalv_sifive_x280_asm) +INVSCALV_KER_PROT(dcomplex, z, invscalv_sifive_x280_asm) + +SCAL2V_KER_PROT(float, s, scal2v_sifive_x280_intr) +SCAL2V_KER_PROT(double, d, scal2v_sifive_x280_intr) +SCAL2V_KER_PROT(scomplex, c, scal2v_sifive_x280_intr) +SCAL2V_KER_PROT(dcomplex, z, scal2v_sifive_x280_intr) + +SCALV_KER_PROT(float, s, scalv_sifive_x280_intr) +SCALV_KER_PROT(double, d, scalv_sifive_x280_intr) +SCALV_KER_PROT(scomplex, c, scalv_sifive_x280_intr) +SCALV_KER_PROT(dcomplex, z, scalv_sifive_x280_intr) + +SETV_KER_PROT(float, s, setv_sifive_x280_asm) +SETV_KER_PROT(double, d, setv_sifive_x280_asm) +SETV_KER_PROT(scomplex, c, setv_sifive_x280_asm) +SETV_KER_PROT(dcomplex, z, setv_sifive_x280_asm) + +SUBV_KER_PROT(float, s, subv_sifive_x280_intr) +SUBV_KER_PROT(double, d, subv_sifive_x280_intr) +SUBV_KER_PROT(scomplex, c, subv_sifive_x280_intr) +SUBV_KER_PROT(dcomplex, z, subv_sifive_x280_intr) + +SWAPV_KER_PROT(float, s, swapv_sifive_x280_asm) +SWAPV_KER_PROT(double, d, swapv_sifive_x280_asm) +SWAPV_KER_PROT(scomplex, c, swapv_sifive_x280_asm) +SWAPV_KER_PROT(dcomplex, z, swapv_sifive_x280_asm) + +XPBYV_KER_PROT(float, s, xpbyv_sifive_x280_intr) +XPBYV_KER_PROT(double, d, 
xpbyv_sifive_x280_intr) +XPBYV_KER_PROT(scomplex, c, xpbyv_sifive_x280_intr) +XPBYV_KER_PROT(dcomplex, z, xpbyv_sifive_x280_intr) + +// Level 1f +AXPY2V_KER_PROT(float, s, axpy2v_sifive_x280_intr) +AXPY2V_KER_PROT(double, d, axpy2v_sifive_x280_intr) +AXPY2V_KER_PROT(scomplex, c, axpy2v_sifive_x280_intr) +AXPY2V_KER_PROT(dcomplex, z, axpy2v_sifive_x280_intr) + +AXPYF_KER_PROT(float, s, axpyf_sifive_x280_asm) +AXPYF_KER_PROT(double, d, axpyf_sifive_x280_asm) +AXPYF_KER_PROT(scomplex, c, axpyf_sifive_x280_asm) +AXPYF_KER_PROT(dcomplex, z, axpyf_sifive_x280_asm) + +DOTXF_KER_PROT(float, s, dotxf_sifive_x280_asm) +DOTXF_KER_PROT(double, d, dotxf_sifive_x280_asm) +DOTXF_KER_PROT(scomplex, c, dotxf_sifive_x280_asm) +DOTXF_KER_PROT(dcomplex, z, dotxf_sifive_x280_asm) + +DOTAXPYV_KER_PROT(float, s, dotaxpyv_sifive_x280_intr) +DOTAXPYV_KER_PROT(double, d, dotaxpyv_sifive_x280_intr) +DOTAXPYV_KER_PROT(scomplex, c, dotaxpyv_sifive_x280_intr) +DOTAXPYV_KER_PROT(dcomplex, z, dotaxpyv_sifive_x280_intr) + +DOTXAXPYF_KER_PROT(float, s, dotxaxpyf_sifive_x280_asm) +DOTXAXPYF_KER_PROT(double, d, dotxaxpyf_sifive_x280_asm) +DOTXAXPYF_KER_PROT(scomplex,c, dotxaxpyf_sifive_x280_asm) +DOTXAXPYF_KER_PROT(dcomplex,z, dotxaxpyf_sifive_x280_asm) + +// Level 1m +PACKM_KER_PROT(float, s, packm_sifive_x280_asm_7xk) +PACKM_KER_PROT(double, d, packm_sifive_x280_asm_7xk) +PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_6xk) +PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_6xk) +PACKM_KER_PROT(float, s, packm_sifive_x280_asm_64xk) +PACKM_KER_PROT(double, d, packm_sifive_x280_asm_32xk) +PACKM_KER_PROT(scomplex, c, packm_sifive_x280_asm_32xk) +PACKM_KER_PROT(dcomplex, z, packm_sifive_x280_asm_16xk) + +// Level 3 +GEMM_UKR_PROT(float, s, gemm_sifive_x280_asm_7m4) +GEMM_UKR_PROT(double, d, gemm_sifive_x280_asm_7m4) +GEMM_UKR_PROT(scomplex, c, gemm_sifive_x280_asm_6m2) +GEMM_UKR_PROT(dcomplex, z, gemm_sifive_x280_asm_6m2) + +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_l_sifive_x280_asm) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_l_sifive_x280_asm) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_l_sifive_x280_asm) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_l_sifive_x280_asm) +GEMMTRSM_UKR_PROT(float, s, gemmtrsm_u_sifive_x280_asm) +GEMMTRSM_UKR_PROT(double, d, gemmtrsm_u_sifive_x280_asm) +GEMMTRSM_UKR_PROT(scomplex, c, gemmtrsm_u_sifive_x280_asm) +GEMMTRSM_UKR_PROT(dcomplex, z, gemmtrsm_u_sifive_x280_asm) diff --git a/kernels/sifive_x280/riscv_cmul_macros_asm.h b/kernels/sifive_x280/riscv_cmul_macros_asm.h new file mode 100644 index 0000000000..9c33fd7bc5 --- /dev/null +++ b/kernels/sifive_x280/riscv_cmul_macros_asm.h @@ -0,0 +1,137 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
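Returning to the kernel prototype header (bli_kernels_sifive_x280.h) just above: it is purely declarative. Each *_KER_PROT(ctype, ch, name) line asks BLIS's prototype macros to declare one typed kernel, with the ch character pasted into the usual bli_<ch><name> naming scheme, so the float addv entry declares bli_saddv_sifive_x280_intr and likewise for d/c/z. The _intr and _asm suffixes only record whether the implementation uses RVV intrinsics or hand-written inline assembly. As an approximate illustration of one such expansion (the exact parameter qualifiers come from BLIS's macro definitions and may differ):

    // Approximate expansion of ADDV_KER_PROT(float, s, addv_sifive_x280_intr);
    // restrict/const qualifiers omitted.
    void bli_saddv_sifive_x280_intr
         (
           conj_t  conjx,
           dim_t   n,
           float*  x, inc_t incx,
           float*  y, inc_t incy,
           cntx_t* cntx
         );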
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// macros to emit complex multiplication +// caveat: the destination registers cannot overlap the source registers! +// rd = rs1 * rs2 +#define cmul(rd_r, rd_i, rs1_r, rs1_i, rs2_r, rs2_i) \ + \ + __asm__(FMUL#rd_r", "#rs1_r", "#rs2_r);\ + __asm__(FMUL#rd_i", "#rs1_r", "#rs2_i);\ + __asm__(FNMSUB#rd_r", "#rs1_i", "#rs2_i", "#rd_r);\ + __asm__(FMADD#rd_i", "#rs1_i", "#rs2_r", "#rd_i) + +// vd = vs2 * f[rs1] +#define vcmul_vf(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ + \ + __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\ + __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\ + __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\ + __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i) + +#define vcmul_vf2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ + \ + __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\ + __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\ + __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ + __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) + +// vd = conj(vs2) * f[rs1] +#define vcmul_vf_conj(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ + \ + __asm__("vfmul.vf "#vd_r", "#vs2_r", "#rs1_r);\ + __asm__("vfmul.vf "#vd_i", "#vs2_r", "#rs1_i);\ + __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ + __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) + +#define vcmul_vf_conj2(vd_r, vd_i, vs2_r, vs2_i, rs1_r, rs1_i) \ + \ + __asm__("vfmul.vf "#vd_r", "#vs2_r", %0" : : "f"(rs1_r));\ + __asm__("vfmul.vf "#vd_i", "#vs2_r", %0" : : "f"(rs1_i));\ + __asm__("vfmacc.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ + __asm__("vfnmsac.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) + +// vd += vs2 * f[rs1] +#define vcmacc_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ + \ + __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\ + __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\ + __asm__("vfnmsac.vf "#vd_r", "#rs1_i", "#vs2_i);\ + __asm__("vfmacc.vf "#vd_i", "#rs1_r", "#vs2_i) + +#define vcmacc_vf2(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ + \ + __asm__("vfmacc.vf "#vd_r", %0, "#vs2_r : : "f"(rs1_r));\ + __asm__("vfmacc.vf "#vd_i", %0, "#vs2_r : : "f"(rs1_i));\ + __asm__("vfnmsac.vf "#vd_r", %0, "#vs2_i : : "f"(rs1_i));\ + __asm__("vfmacc.vf "#vd_i", %0, "#vs2_i : : "f"(rs1_r)) + +// vd += conj(vs2) * f[rs1] +#define vcmacc_vf_conj(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ + \ + __asm__("vfmacc.vf "#vd_r", "#rs1_r", "#vs2_r);\ + __asm__("vfmacc.vf "#vd_i", "#rs1_i", "#vs2_r);\ + __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ + __asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) + +// vd -= vs2 * f[rs1] +#define vcnmsac_vf(vd_r, vd_i, rs1_r, rs1_i, vs2_r, vs2_i) \ + \ + __asm__("vfnmsac.vf "#vd_r", "#rs1_r", "#vs2_r);\ + __asm__("vfnmsac.vf "#vd_i", "#rs1_i", "#vs2_r);\ + __asm__("vfmacc.vf "#vd_r", "#rs1_i", "#vs2_i);\ + 
__asm__("vfnmsac.vf "#vd_i", "#rs1_r", "#vs2_i) + +// vd = vs2 * vs1 +#define vcmul_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ + \ + __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\ + __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\ + __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\ + __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) + +// vd = vs2 * conj(vs1) +#define vcmul_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ + \ + __asm__("vfmul.vv "#vd_r", "#vs2_r", "#vs1_r);\ + __asm__("vfmul.vv "#vd_i", "#vs2_r", "#vs1_i);\ + __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\ + __asm__("vfmsac.vv "#vd_i", "#vs2_i", "#vs1_r) + +// vd += vs2 * vs1 +#define vcmacc_vv(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ + \ + __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\ + __asm__("vfmacc.vv "#vd_i", "#vs2_r", "#vs1_i);\ + __asm__("vfnmsac.vv "#vd_r", "#vs2_i", "#vs1_i);\ + __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) + +// vd += vs2 * conj(vs1) +#define vcmacc_vv_conj(vd_r, vd_i, vs2_r, vs2_i, vs1_r, vs1_i) \ + \ + __asm__("vfmacc.vv "#vd_r", "#vs2_r", "#vs1_r);\ + __asm__("vfnmsac.vv "#vd_i", "#vs2_r", "#vs1_i);\ + __asm__("vfmacc.vv "#vd_r", "#vs2_i", "#vs1_i);\ + __asm__("vfmacc.vv "#vd_i", "#vs2_i", "#vs1_r) + diff --git a/kernels/sifive_x280/riscv_overloaded_intrinsics.h b/kernels/sifive_x280/riscv_overloaded_intrinsics.h new file mode 100644 index 0000000000..6a1d11b131 --- /dev/null +++ b/kernels/sifive_x280/riscv_overloaded_intrinsics.h @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2023, SiFive, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +// 6. 
Configuration-Setting and Utility Functions +#define RVV_TYPE_F_(PRECISION, LMUL) vfloat##PRECISION##LMUL##_t +#define RVV_TYPE_F(PRECISION, LMUL) RVV_TYPE_F_(PRECISION, LMUL) +#define RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) vfloat##PRECISION##LMUL##x##NFIELDS##_t +#define RVV_TYPE_FX(PRECISION, LMUL, NFIELDS) RVV_TYPE_FX_(PRECISION, LMUL, NFIELDS) +#define VSETVL_(PRECISION, LMUL) __riscv_vsetvl_e##PRECISION##LMUL +#define VSETVL(PRECISION, LMUL) VSETVL_(PRECISION, LMUL) + +// 7. Vector Loads and Stores +// Loads +#define VLE_V_F_(PRECISION, LMUL) __riscv_vle##PRECISION##_v_f##PRECISION##LMUL +#define VLE_V_F(PRECISION, LMUL) VLE_V_F_(PRECISION, LMUL) +#define VLSE_V_F_(PRECISION, LMUL) __riscv_vlse##PRECISION##_v_f##PRECISION##LMUL +#define VLSE_V_F(PRECISION, LMUL) VLSE_V_F_(PRECISION, LMUL) +#define VLSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VLSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSEG2_V_F_(PRECISION, LMUL, NFIELDS) +#define VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vlsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VLSSEG2_V_F(PRECISION, LMUL, NFIELDS) VLSSEG2_V_F_(PRECISION, LMUL, NFIELDS) +// Stores +#define VSE_V_F_(PRECISION, LMUL) __riscv_vse##PRECISION##_v_f##PRECISION##LMUL +#define VSE_V_F(PRECISION, LMUL) VSE_V_F_(PRECISION, LMUL) +#define VSSE_V_F_(PRECISION, LMUL) __riscv_vsse##PRECISION##_v_f##PRECISION##LMUL +#define VSSE_V_F(PRECISION, LMUL) VSSE_V_F_(PRECISION, LMUL) +#define VSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vsseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSEG2_V_F_(PRECISION, LMUL, NFIELDS) +#define VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vssseg2e##PRECISION##_v_f##PRECISION##LMUL##x##NFIELDS +#define VSSSEG2_V_F(PRECISION, LMUL, NFIELDS) VSSSEG2_V_F_(PRECISION, LMUL, NFIELDS) + +// 13. 
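The token-pasting wrappers above let a single intrinsics-based kernel source be instantiated for any element width and LMUL. For example, VLSEG2_V_F(64, m4, 2) expands to __riscv_vlseg2e64_v_f64m4x2, which loads interleaved real/imaginary pairs into a two-field vector tuple. A hypothetical usage sketch, not part of the patch, assuming <riscv_vector.h> and this header are in scope and RVV is enabled (VGET_V_F is defined further down in this header):

    // Load a unit-stride dcomplex vector as separate real/imaginary vectors.
    static void example_load_dcomplex(const double* x, size_t n)
    {
        size_t vl = VSETVL(64, m4)(n);                       // __riscv_vsetvl_e64m4
        RVV_TYPE_FX(64, m4, 2) x2 =
            VLSEG2_V_F(64, m4, 2)(x, vl);                    // __riscv_vlseg2e64_v_f64m4x2
        RVV_TYPE_F(64, m4) xr = VGET_V_F(64, m4, 2)(x2, 0);  // real parts
        RVV_TYPE_F(64, m4) xi = VGET_V_F(64, m4, 2)(x2, 1);  // imaginary parts
        (void)xr; (void)xi;                                  // illustration only
    }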
Vector Floating-Point Operations +#define VFADD_VV_(PRECISION, LMUL) __riscv_vfadd_vv_f##PRECISION##LMUL +#define VFADD_VV(PRECISION, LMUL) VFADD_VV_(PRECISION, LMUL) +#define VFSUB_VV_(PRECISION, LMUL) __riscv_vfsub_vv_f##PRECISION##LMUL +#define VFSUB_VV(PRECISION, LMUL) VFSUB_VV_(PRECISION, LMUL) +#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL +#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) +#define VFMUL_VV_(PRECISION, LMUL) __riscv_vfmul_vv_f##PRECISION##LMUL +#define VFMUL_VV(PRECISION, LMUL) VFMUL_VV_(PRECISION, LMUL) +#define VFMUL_VF_(PRECISION, LMUL) __riscv_vfmul_vf_f##PRECISION##LMUL +#define VFMUL_VF(PRECISION, LMUL) VFMUL_VF_(PRECISION, LMUL) +#define VFMACC_VF_(PRECISION, LMUL) __riscv_vfmacc_vf_f##PRECISION##LMUL +#define VFMACC_VF(PRECISION, LMUL) VFMACC_VF_(PRECISION, LMUL) +#define VFMACC_VV_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL +#define VFMACC_VV(PRECISION, LMUL) VFMACC_VV_(PRECISION, LMUL) +#define VFMACC_VV_TU_(PRECISION, LMUL) __riscv_vfmacc_vv_f##PRECISION##LMUL##_tu +#define VFMACC_VV_TU(PRECISION, LMUL) VFMACC_VV_TU_(PRECISION, LMUL) +#define VFMSAC_VF_(PRECISION, LMUL) __riscv_vfmsac_vf_f##PRECISION##LMUL +#define VFMSAC_VF(PRECISION, LMUL) VFMSAC_VF_(PRECISION, LMUL) +#define VFNMSAC_VF_(PRECISION, LMUL) __riscv_vfnmsac_vf_f##PRECISION##LMUL +#define VFNMSAC_VF(PRECISION, LMUL) VFNMSAC_VF_(PRECISION, LMUL) +#define VFNMSAC_VV_TU_(PRECISION, LMUL) __riscv_vfnmsac_vv_f##PRECISION##LMUL##_tu +#define VFNMSAC_VV_TU(PRECISION, LMUL) VFNMSAC_VV_TU_(PRECISION, LMUL) +#define VFMADD_VF_(PRECISION, LMUL) __riscv_vfmadd_vf_f##PRECISION##LMUL +#define VFMADD_VF(PRECISION, LMUL) VFMADD_VF_(PRECISION, LMUL) +#define VFMSUB_VF_(PRECISION, LMUL) __riscv_vfmsub_vf_f##PRECISION##LMUL +#define VFMSUB_VF(PRECISION, LMUL) VFMSUB_VF_(PRECISION, LMUL) +#define VFNEG_VF_(PRECISION, LMUL) __riscv_vfneg_v_f##PRECISION##LMUL +#define VFNEG_VF(PRECISION, LMUL) VFNEG_VF_(PRECISION, LMUL) +#define VFMV_V_V_(PRECISION, LMUL) VREINTERPRET_V_I_F(PRECISION, LMUL)( __riscv_vmv_v_v_i##PRECISION##LMUL( VREINTERPRET_V_F_I(PRECISION, LMUL) CURRY_1ARG +#define VFMV_V_V(PRECISION, LMUL) VFMV_V_V_(PRECISION, LMUL) + +// 14. Vector Reduction Operations +#define VF_REDUSUM_VS_(PRECISION, LMUL) __riscv_vfredusum_vs_f##PRECISION##LMUL##_f##PRECISION##m1 +#define VF_REDUSUM_VS(PRECISION, LMUL) VF_REDUSUM_VS_(PRECISION, LMUL) + +// 16. 
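The arithmetic wrappers above compose in the same way. As a hypothetical illustration, not part of the patch, here is y := alpha*x + y over a single strip of up to vlmax single-precision elements, written entirely with these macros (assumes <riscv_vector.h> is included and the vector extension is enabled):

    static void example_axpy_strip(size_t n, float alpha,
                                   const float* x, float* y)
    {
        size_t vl = VSETVL(32, m8)(n);                    // __riscv_vsetvl_e32m8
        RVV_TYPE_F(32, m8) xv = VLE_V_F(32, m8)(x, vl);   // __riscv_vle32_v_f32m8
        RVV_TYPE_F(32, m8) yv = VLE_V_F(32, m8)(y, vl);
        yv = VFMACC_VF(32, m8)(yv, alpha, xv, vl);        // yv += alpha * xv
        VSE_V_F(32, m8)(y, yv, vl);                       // __riscv_vse32_v_f32m8
    }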
Vector Permutation Operations +#define VFMV_S_F_(PRECISION, LMUL) __riscv_vfmv_s_f_f##PRECISION##LMUL +#define VFMV_S_F(PRECISION, LMUL) VFMV_S_F_(PRECISION, LMUL) +#define VFMV_F_S_(PRECISION) __riscv_vfmv_f_s_f##PRECISION##m1_f##PRECISION +#define VFMV_F_S(PRECISION) VFMV_F_S_(PRECISION) + +// Miscellaneous Vector Function +#define VREINTERPRET_V_I_F_(PRECISION, LMUL) __riscv_vreinterpret_v_i##PRECISION##LMUL##_f##PRECISION##LMUL +#define VREINTERPRET_V_I_F(PRECISION, LMUL) VREINTERPRET_V_I_F_(PRECISION, LMUL) +#define VREINTERPRET_V_F_I_(PRECISION, LMUL) __riscv_vreinterpret_v_f##PRECISION##LMUL##_i##PRECISION##LMUL +#define VREINTERPRET_V_F_I(PRECISION, LMUL) VREINTERPRET_V_F_I_(PRECISION, LMUL) +#define VGET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vget_v_f##PRECISION##LMUL##x##NFIELDS##_f##PRECISION##LMUL +#define VGET_V_F(PRECISION, LMUL, NFIELDS) VGET_V_F_(PRECISION, LMUL, NFIELDS) +#define VSET_V_F_(PRECISION, LMUL, NFIELDS) __riscv_vset_v_f##PRECISION##LMUL##_f##PRECISION##LMUL##x##NFIELDS +#define VSET_V_F(PRECISION, LMUL, NFIELDS) VSET_V_F_(PRECISION, LMUL, NFIELDS) + +// Non-vector functions +#define CURRY_1ARG(arg1, ...) (arg1), __VA_ARGS__)) diff --git a/travis/do_riscv.sh b/travis/do_riscv.sh index a51d33061a..56c2b85c26 100755 --- a/travis/do_riscv.sh +++ b/travis/do_riscv.sh @@ -3,16 +3,19 @@ set -e set -x -TAG=2023.02.25 +TAG=2023.10.18 # The prebuilt toolchains only support hardfloat, so we only # test these for now. case $1 in "rv32iv") - TARBALL=riscv32-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz + TARBALL=riscv32-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz ;; "rv64iv") - TARBALL=riscv64-glibc-ubuntu-20.04-nightly-${TAG}-nightly.tar.gz + TARBALL=riscv64-glibc-ubuntu-20.04-gcc-nightly-${TAG}-nightly.tar.gz + ;; + "sifive_x280") + TARBALL=riscv64-glibc-ubuntu-20.04-llvm-nightly-${TAG}-nightly.tar.gz ;; *) exit 1
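One detail of riscv_overloaded_intrinsics.h above is easy to miss: VFMV_V_V is deliberately left ending mid-expression, and the CURRY_1ARG helper at the bottom of the header re-emits the call-site argument list together with the closing parentheses. The result is a whole-register floating-point vector copy synthesized from the integer vmv.v.v intrinsic and two reinterpret casts. An editor's sketch of the expansion (the names v and vl are hypothetical):

    // For PRECISION = 32 and LMUL = m4, the call
    //     vfloat32m4_t w = VFMV_V_V(32, m4)(v, vl);
    // is rewritten by the preprocessor to
    vfloat32m4_t w =
        __riscv_vreinterpret_v_i32m4_f32m4(
            __riscv_vmv_v_v_i32m4(
                __riscv_vreinterpret_v_f32m4_i32m4(v), vl));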