diff --git a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
index ddf7f9da1f..c423dd131d 100644
--- a/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_amaxv_sifive_x280_asm.c
@@ -85,8 +85,7 @@ void bli_samaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -147,8 +146,7 @@ void bli_damaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -214,8 +212,7 @@ void bli_camaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
@@ -278,8 +275,7 @@ void bli_zamaxv_sifive_x280_asm(dim_t n, const void * restrict x_, inc_t incx,
             __asm__("vadd.vx v24, v24, %0" : : "r"(offset));
             __asm__("vmerge.vvm v16, v16, v24, v0");
         }
-        inc_t tmp = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         offset += vl;
         avl -= vl;
     }
diff --git a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
index 75332e61bd..3571877759 100644
--- a/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_copyv_sifive_x280_asm.c
@@ -50,6 +50,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const float* restrict x = x_;
     float* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -69,10 +71,8 @@ void bli_scopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -93,8 +93,11 @@ void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
                                 inc_t incx, void * restrict y_, inc_t incy,
                                 const cntx_t *cntx) {
     (void)conjx;
+    (void)cntx;
     const double* restrict x = x_;
     double* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -114,10 +117,8 @@ void bli_dcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -144,6 +145,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const scomplex* restrict x = x_;
     scomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -164,10 +167,8 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     } else {
@@ -189,50 +190,10 @@ void bli_ccopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
-        /*
-        // After some benchmarks, it looks like using vl(s)e and vs(s)e with
-        masked
-        // instructions for conjugation is faster than using segment loads and
-        stores.
-        // We'll use the segment load/store version for now, but I'd like to
-        leave this
-        // code here (but commented out) for possible future use.
-        size_t avl = n;
-        // 0xA = 0b1010
-        // this masks off the real parts, so only the imaginary parts are
-        negated
-        // this mask is large enough only for vl <= 64
-        uint64_t mask[1] = {0xAAAAAAAAAAAAAAAA};
-        __asm__("vsetivli zero, 1, e64, m1, ta, ma");
-        __asm__("vle64.v v0, (%0)" : : "r"(mask));
-        while (avl) {
-            size_t vl;
-            __asm__ volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) :
-            "r"(avl)); if (incx == 8)
-                __asm__("vle64.v v4, (%0)" : : "r"(x));
-            else
-                __asm__("vlse64.v v4, (%0), %1" : : "r"(x), "r"(incx));
-            // set vl = VLMAX
-            __asm__ volatile("vsetvli t0, zero, e32, m4, ta, ma");
-            __asm__("vfneg.v v4, v4, v0.t");
-            __asm__ volatile ("vsetvli zero, %0, e64, m4, ta, ma" : : "r"(avl));
-            if (incy == 8)
-                __asm__("vse64.v v4, (%0)" : : "r"(y));
-            else
-                __asm__("vsse64.v v4, (%0), %1" : : "r"(y), "r"(incy));
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
-            avl -= vl;
-        }
-        */
     }
     return;
 }
@@ -263,6 +224,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
     (void)cntx;
     const dcomplex* restrict x = x_;
     dcomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -300,10 +263,8 @@ void bli_zcopyv_sifive_x280_asm(conj_t conjx, dim_t n, const void * restrict x_,
             else
                 __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-            inc_t tmp1 = vl * incx;
-            inc_t tmp2 = vl * incy;
-            __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-            __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+            __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+            __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
             avl -= vl;
         }
     }
diff --git a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
index 4d18913480..cbca885929 100644
--- a/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_invertv_sifive_x280_asm.c
@@ -49,6 +49,8 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     float* restrict x = x_;
+    if (n <= 0)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -68,8 +70,7 @@ void bli_sinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -93,6 +94,8 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     double* restrict x = x_;
+    if (n <= 0)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -112,8 +115,7 @@ void bli_dinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfrdiv.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -136,6 +138,8 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     scomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -161,8 +165,7 @@ void bli_cinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -184,6 +187,8 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
                                   const cntx_t *cntx) {
     (void)cntx;
     dcomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     size_t avl = n;
@@ -209,8 +214,7 @@ void bli_zinvertv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
             __asm__("vfdiv.vv v4, v4, v8");
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
diff --git a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
index 5b2487aeaf..51edc92214 100644
--- a/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_invscalv_sifive_x280_asm.c
@@ -53,6 +53,8 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const float* restrict alpha = alpha_;
     float* restrict x = x_;
+    if (n <= 0 || *alpha == 0.f || *alpha == 1.f)
+        return;
 
     float one = 1.f;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -74,8 +76,7 @@ void bli_sinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -104,6 +105,8 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const double* restrict alpha = alpha_;
     double* restrict x = x_;
+    if (n <= 0 || *alpha == 0. || *alpha == 1.)
+        return;
 
     double one = 1.;
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(&one));
@@ -125,8 +128,7 @@ void bli_dinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmul.vf v0, v0, f0");
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -157,6 +159,8 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const scomplex* restrict alpha = alpha_;
     scomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0.f && alpha->imag == 0.f) || (alpha->real == 1.f && alpha->imag == 0.f))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -188,8 +192,7 @@ void bli_cinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -223,6 +226,8 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
     (void)cntx;
     const dcomplex* restrict alpha = alpha_;
     dcomplex* restrict x = x_;
+    if (n <= 0 || (alpha->real == 0. && alpha->imag == 0.) || (alpha->real == 1. && alpha->imag == 0.))
+        return;
 
     __asm__(FLT_LOAD "f0, (%0)" : : "r"(alpha));
     __asm__(FLT_LOAD "f1, %1(%0)" : : "r"(alpha), "I"(FLT_SIZE));
@@ -254,8 +259,7 @@ void bli_zinvscalv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restr
             __asm__("vfmacc.vf v12, f1, v0");
             __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(x), "r"(incx));
         }
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
diff --git a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
index 422f2c68a4..ef9091f16c 100644
--- a/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_setv_sifive_x280_asm.c
@@ -49,6 +49,8 @@ void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
     (void)cntx;
     const float* restrict alpha = alpha_;
     float* restrict x = x_;
+    if (n <= 0)
+        return;
 
     __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
                      :
@@ -66,8 +68,7 @@ void bli_ssetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
             __asm__(VSE "v0, (%0)" : : "r"(x));
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -89,6 +90,8 @@ void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
     (void)cntx;
     const double* restrict alpha = alpha_;
     double* restrict x = x_;
+    if (n <= 0)
+        return;
 
     __asm__ volatile("vsetvli zero, %0, e%1, m8, ta, ma"
                      :
@@ -106,8 +109,7 @@ void bli_dsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
             __asm__(VSE "v0, (%0)" : : "r"(x));
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(x), "r"(incx));
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
@@ -128,6 +130,8 @@ void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
     (void)cntx;
     const scomplex* restrict alpha = alpha_;
     scomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
                      :
@@ -149,42 +153,10 @@ void bli_csetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
             __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
         else
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
-
-    /*(void) cntx;
-    // See comment in ccopyv.
-    // We probably won't have to worry about this once the vlseg2/vsseg2
-    performance bug is fixed.
-    __asm__ volatile("vsetvli t0, zero, e%0, m4, ta, ma" : : "i"(8 * 2 *
-    FLT_SIZE));
-    __asm__(VLSE "v4, (%0), zero" : : "r"(alpha));
-    incx *= 2 * FLT_SIZE;
-    if (conjalpha == BLIS_CONJUGATE) {
-        uint64_t mask = 0xAAAAAAAAAAAAAAAA;
-        __asm__("vsetivli zero, 1, e64, m1, ta, ma");
-        __asm__("vle64.v v0, (%0)" : : "r"(&mask));
-        __asm__ volatile("vsetvli t0, zero, e%0, m4, ta, ma" : : "i"(8 *
-        FLT_SIZE));
-        __asm__("vfneg.v v4, v4, v0.t");
-    }
-
-    size_t avl = n;
-    while (avl) {
-        size_t vl;
-        __asm__ volatile("vsetvli %0, %1, e%2, m4, ta, ma" : "=r"(vl) : "r"(avl),
-        "i"(8 * 2 * FLT_SIZE)); if (incx == 2 * FLT_SIZE)
-            __asm__(VSE "v4, (%0)" : : "r"(x));
-        else
-            __asm__(VSSE "v4, (%0), %1" : : "r"(x), "r"(incx));
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        avl -= vl;
-    }
-    return;*/
 }
 
 #undef FLT_SIZE
@@ -202,6 +174,8 @@ void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
     (void)cntx;
     const dcomplex* restrict alpha = alpha_;
     dcomplex* restrict x = x_;
+    if (n <= 0)
+        return;
 
     __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma"
                      :
@@ -223,8 +197,7 @@ void bli_zsetv_sifive_x280_asm(conj_t conjalpha, dim_t n, const void * restrict
             __asm__(VSSEG2 "v0, (%0)" : : "r"(x));
         else
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(x), "r"(incx));
-        inc_t tmp1 = vl * incx;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
         avl -= vl;
     }
     return;
diff --git a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
index da3b61ddc0..2342e254a2 100644
--- a/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
+++ b/kernels/sifive_x280/1/bli_swapv_sifive_x280_asm.c
@@ -49,6 +49,8 @@ void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void *
     (void)cntx;
     float* restrict x = x_;
     float* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -76,10 +78,8 @@ void bli_sswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx, void *
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -102,6 +102,8 @@ void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
     (void)cntx;
     double* restrict x = x_;
     double* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= FLT_SIZE;
     incy *= FLT_SIZE;
@@ -129,10 +131,8 @@ void bli_dswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -155,6 +155,8 @@ void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
     (void)cntx;
     scomplex* restrict x = x_;
     scomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -182,10 +184,8 @@ void bli_cswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
         else
             __asm__(VSSE "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;
@@ -208,6 +208,8 @@ void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
     (void)cntx;
     dcomplex* restrict x = x_;
     dcomplex* restrict y = y_;
+    if (n <= 0)
+        return;
 
     incx *= 2 * FLT_SIZE;
     incy *= 2 * FLT_SIZE;
@@ -235,10 +237,8 @@ void bli_zswapv_sifive_x280_asm(dim_t n, void * restrict x_, inc_t incx,
         else
             __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(y), "r"(incy));
 
-        inc_t tmp1 = vl * incx;
-        inc_t tmp2 = vl * incy;
-        __asm__("add %0, %0, %1" : "+r"(x) : "r"(tmp1));
-        __asm__("add %0, %0, %1" : "+r"(y) : "r"(tmp2));
+        __asm__("add %0, %0, %1" : "+r"(x) : "r"(vl * incx));
+        __asm__("add %0, %0, %1" : "+r"(y) : "r"(vl * incy));
         avl -= vl;
     }
     return;