From 10cf9f8bedf20669e1624a4842eb7fd8edf70a71 Mon Sep 17 00:00:00 2001 From: bill lam Date: Fri, 15 Nov 2024 11:49:26 +0800 Subject: [PATCH 1/4] 9!:65 enable/disable boxed sparse --- jsrc/cu.c | 5 ++++ jsrc/j.c | 3 +++ jsrc/je.h | 1 + jsrc/v1.c | 6 ++++- jsrc/vo.c | 10 +++++++- jsrc/x.c | 69 ++++++++++++++++++++++++++++--------------------------- jsrc/xa.c | 25 +++++++++++++++++++- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/jsrc/cu.c b/jsrc/cu.c index 592c70b23..37a88ec0e 100644 --- a/jsrc/cu.c +++ b/jsrc/cu.c @@ -6,6 +6,9 @@ #include "j.h" #include "ve.h" +#ifdef BOXEDSPARSE +extern UC fboxedsparse; +#endif static A jteverysp(J jt,A w,A fs){A*wv,x,z,*zv;P*wp,*zp; ARGCHK1(w); @@ -124,6 +127,8 @@ A jtevery(J jt, A w, A fs){A * RESTRICT wv,x,z,* RESTRICT zv; } #ifndef BOXEDSPARSE ASSERT(!ISSPARSE(AT(x)),EVNONCE); +#else + ASSERT(fboxedsparse||!ISSPARSE(AT(x)),EVNONCE); #endif // Store result & advance to next cell *zv++=x; diff --git a/jsrc/j.c b/jsrc/j.c index 05cd56248..379a23748 100644 --- a/jsrc/j.c +++ b/jsrc/j.c @@ -125,6 +125,9 @@ uint64_t g_cpuFeatures2; // fsgsbase int numberOfCores; // number of cpu cores UC hwaes=0; // hardware aes support UC hwfma=0; // blis cpu tuning +#ifdef BOXEDSPARSE +UC fboxedsparse=1; // enable boxed sparse +#endif I fortesting=0; // used for measurements // globals end diff --git a/jsrc/je.h b/jsrc/je.h index 7dce1461d..16c5e3637 100644 --- a/jsrc/je.h +++ b/jsrc/je.h @@ -27,6 +27,7 @@ extern F1(jtbehead); extern F1(jtbinrep1); // extern F1(jtbitadv); extern F1(jtbox); +extern F1(jtboxedsparse); extern F1(jtboxopen); extern F1(jtboxq); extern F1(jtboxs); diff --git a/jsrc/v1.c b/jsrc/v1.c index 76d02175b..c4cf92c5a 100644 --- a/jsrc/v1.c +++ b/jsrc/v1.c @@ -7,6 +7,10 @@ #include "vcomp.h" #include "ve.h" +#ifdef BOXEDSPARSE +extern UC fboxedsparse; +#endif + #ifdef MMSC_VER #pragma warning(disable: 4244) #endif @@ -405,7 +409,7 @@ static B jtmatchsub(J jt,A a,A w,B* RESTRICT x,I af,I wf,I m,I n,I b1){C*av,*wv; 
if(unlikely(t&FUNC))R (!eqf(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match' if(unlikely(t&NAME))R (!eqname(a,w))^(x==0?1:b1); // true value, but switch if return is not 'match' #ifdef BOXEDSPARSE - if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w); + if(fboxedsparse) if(unlikely(ISSPARSE(at|wt)))R num(1)==matchs(a,w); #endif // If the types mismatch, convert as needed to the common (unsafe) type calculated earlier if(at!=wt) { diff --git a/jsrc/vo.c b/jsrc/vo.c index 4872a6553..3a643e56f 100644 --- a/jsrc/vo.c +++ b/jsrc/vo.c @@ -6,6 +6,10 @@ #define ZZDEFN #include "result.h" +#ifdef BOXEDSPARSE +extern UC fboxedsparse; +#endif + I level(J jt,A w){A*wv;I d,j; ARGCHK1(w); if((-AN(w)&-(AT(w)&BOX))>=0)R 0; @@ -31,6 +35,8 @@ F1(jtbox){A y,z,*zv;C*wv;I f,k,m,n,r,wr,*ws; F1PREFIP;ARGCHK1(w);I wt=AT(w); FLAGT waf=AFLAG(w); #ifndef BOXEDSPARSE ASSERTF(!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); +#else + ASSERTF(fboxedsparse||!ISSPARSE(wt),EVNONCE,"can't box sparse arrays"); #endif wr=AR(w); r=(RANKT)jt->ranks; r=wrlocaluse.lu1.linkvb; // flag: sign set if (,<) or ,&< or (;<) which will always box w; bit 0 set if (,<) optype|=((I)jtinplace&JTWILLBEOPENED)<<(BOXX-JTWILLBEOPENEDX); // fold in BOX flag that tells us to allow virtual boxed results diff --git a/jsrc/x.c b/jsrc/x.c index d0b8aa15d..e8fac9772 100644 --- a/jsrc/x.c +++ b/jsrc/x.c @@ -115,8 +115,8 @@ void jtforeigninit(J jt){UI i; MN(4,0) XPRIM(VERB, jtnc, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(6,0) XPRIM(VERB, jtts0, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(6,1) XPRIM(VERB, jttss, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(6,14) XPRIM(VERB, jtinttoe, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(6,15) XPRIM(VERB, jtetoint, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(6,14) XPRIM(VERB, jtinttoe, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(6,15) XPRIM(VERB, jtetoint, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(6,16) XPRIM(VERB, jtetoiso8601,jtetoiso8601, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(6,17) 
XPRIM(VERB, jtiso8601toe,jtiso8601toe, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(6,18) XPRIM(VERB, jtstringtoe,jtstringtoe, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); @@ -138,10 +138,10 @@ void jtforeigninit(J jt){UI i; MN(18,5) XPRIM(VERB, jtlocname, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(128,2) XPRIM(VERB, 0, jtapplystr, VFLAGNONE,VF2NONE,RMAX,1, RMAX); MN(128,5) XPRIM(VERB, jtisnan, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(128,9) XPRIM(VERB, jtmvmsparse, 0, VASGSAFE,VF2WILLOPEN1,RMAX, RMAX,RMAX); + MN(128,9) XPRIM(VERB, jtmvmsparse, 0, VASGSAFE,VF2WILLOPEN1,RMAX, RMAX,RMAX); MN(128,11) XPRIM(VERB, 0, jtlrtrim, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(128,12) XPRIM(VERB, 0, jtekupdate, VASGSAFE|VJTFLGOK2,VF2WILLOPEN2A,RMAX,RMAX,RMAX); - MN(128,13) XPRIM(VERB, jtfindspr, 0, VASGSAFE|VJTFLGOK2,VF2WILLOPEN2A,RMAX,RMAX,RMAX); + MN(128,12) XPRIM(VERB, 0, jtekupdate, VASGSAFE|VJTFLGOK2,VF2WILLOPEN2A,RMAX,RMAX,RMAX); + MN(128,13) XPRIM(VERB, jtfindspr, 0, VASGSAFE|VJTFLGOK2,VF2WILLOPEN2A,RMAX,RMAX,RMAX); // infrequently-used fns follow @@ -197,15 +197,15 @@ void jtforeigninit(J jt){UI i; MN(4,3) XPRIM(VERB, jtsnl, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(4,4) XPRIM(VERB, jtscind, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(4,5) XPRIM(VERB, jtnch, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(4,6) XPRIM(VERB, jtscriptstring, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(4,7) XPRIM(VERB, jtscriptnum, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(4,6) XPRIM(VERB, jtscriptstring, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(4,7) XPRIM(VERB, jtscriptnum, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(4,8) XPRIM(ADV, jtcreatecachedref,jtvalenceerr, VASGSAFE,VF2NONE,0L, 0L, 0L ); MN(4,55) XPRIM(VERB, jtex, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(5,0) XPRIM(ADV, jtfxx,jtvalenceerr, VASGSAFE,VF2NONE,0L, 0L, 0L ); MN(5,1) XPRIM(VERB, jtarx, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(5,2) XPRIM(VERB, jtdrx, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(5,4) XPRIM(VERB, jttrx, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); - MN(5,5) XPRIM(VERB, 
jtlrx1, jtlrx2, VASGSAFE,VF2NONE,0, RMAX,RMAX); + MN(5,5) XPRIM(VERB, jtlrx1, jtlrx2, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(5,6) XPRIM(VERB, jtprx, 0, VASGSAFE,VF2NONE,0, RMAX,RMAX); MN(5,7) XPRIM(VERB, 0, jtxrx, VASGSAFE,VF2NONE,RMAX,0, 0 ); // explicit defn info MN(6,2) XPRIM(VERB, jttsit1, jttsit2, VFLAGNONE,VF2NONE,1, 0, 1 ); @@ -256,8 +256,8 @@ void jtforeigninit(J jt){UI i; MN(9,27) XPRIM(VERB, jtieps, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,28) XPRIM(VERB, jtiepdoq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,29) XPRIM(VERB, jtiepdos, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,32) XPRIM(VERB, jtecmtriesq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,33) XPRIM(VERB, jtecmtriess, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,32) XPRIM(VERB, jtecmtriesq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,33) XPRIM(VERB, jtecmtriess, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,34) XPRIM(VERB, jtassertq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,35) XPRIM(VERB, jtasserts, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,36) XPRIM(VERB, jtoutparmq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); @@ -274,19 +274,20 @@ void jtforeigninit(J jt){UI i; MN(9,47) XPRIM(VERB, jtbreakfns, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,52) XPRIM(VERB, jtasgzombq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(9,53) XPRIM(VERB, jtasgzombs, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,54) XPRIM(VERB, jtdeprecxq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,55) XPRIM(VERB, jtdeprecxs, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,54) XPRIM(VERB, jtdeprecxq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,55) XPRIM(VERB, jtdeprecxs, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); #if MEMHISTO - MN(9,54) XPRIM(VERB, jtmemhistoq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,55) XPRIM(VERB, jtmemhistos, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,62) XPRIM(VERB, jtmemhashq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,63) XPRIM(VERB, jtmemhashs, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,54) XPRIM(VERB, jtmemhistoq, 0, 
VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,55) XPRIM(VERB, jtmemhistos, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,62) XPRIM(VERB, jtmemhashq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,63) XPRIM(VERB, jtmemhashs, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); #endif MN(9,56) XPRIM(VERB, jtcpufeature, jtcpufeature2, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,57) XPRIM(VERB, jtaudittdisab, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,57) XPRIM(VERB, jtaudittdisab, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(9,58) XPRIM(VERB, jtgemmtune, jtgemmtune2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,59) XPRIM(VERB, jtemsglevel, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,66) XPRIM(VERB, jtcheckcompfeatures, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,59) XPRIM(VERB, jtemsglevel, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,65) XPRIM(VERB, jtboxedsparse, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,66) XPRIM(VERB, jtcheckcompfeatures, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(13,0) XPRIM(VERB, jtdbc, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,1) XPRIM(VERB, jtdbstack, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,2) XPRIM(VERB, jtdbstopq, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); @@ -298,7 +299,7 @@ void jtforeigninit(J jt){UI i; MN(13,8) XPRIM(VERB, jtdbsig1, jtdbsig2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,9) XPRIM(VERB, jtdbrr1, jtdbrr2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,10) XPRIM(VERB, 0,0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); // still in stdlib - MN(13,11) XPRIM(VERB, jtdberr, jtdberr2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,11) XPRIM(VERB, jtdberr, jtdberr2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,12) XPRIM(VERB, jtdbetx, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,13) XPRIM(VERB, jtdbcall, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,14) XPRIM(VERB, jtdbtrapq, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); @@ -311,7 +312,7 @@ void jtforeigninit(J jt){UI i; MN(13,21) XPRIM(VERB, jtdbstepinto1,jtdbstepinto2,VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,22) XPRIM(VERB, jtdbstepout1, 
jtdbstepout2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,23) XPRIM(VERB, jtdbpasss, jtdbpasss, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(13,24) XPRIM(VERB, jtdbisolatestk, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,24) XPRIM(VERB, jtdbisolatestk, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(15,5) XPRIM(VERB, jtcdf, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(15,7) XPRIM(VERB, jtdllsymset, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(15,8) XPRIM(VERB, jtgh15, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); @@ -334,7 +335,7 @@ void jtforeigninit(J jt){UI i; MN(15,23) XPRIM(VERB, jtcdq, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(18,0) XPRIM(VERB, jtlocnc, 0, VFLAGNONE,VF2NONE,0, RMAX,RMAX); MN(18,1) XPRIM(VERB, jtlocnl1, jtlocnl2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(18,55) XPRIM(VERB, jtlocexmark, jtlocexmark, VFLAGNONE,VF2NONE,0, RMAX,RMAX); + MN(18,55) XPRIM(VERB, jtlocexmark, jtlocexmark, VFLAGNONE,VF2NONE,0, RMAX,RMAX); MN(128,0) XPRIM(VERB, jtqr, 0, VASGSAFE,VF2NONE,2, RMAX,RMAX); MN(128,1) XPRIM(VERB, jtrinv, 0, VASGSAFE,VF2NONE,2, RMAX,RMAX); MN(128,3) XPRIM(VERB, jtcrc1, jtcrc2, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); @@ -342,34 +343,34 @@ void jtforeigninit(J jt){UI i; MN(128,6) XPRIM(VERB, jtshasum1, jtshasum2, VASGSAFE,VF2NONE,1,1,RMAX); MN(128,7) XPRIM(VERB, 0, jtaes2, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(128,8) XPRIM(VERB, jtqhash12, jtqhash12, VASGSAFE|VJTFLGOK1|VJTFLGOK2,VF2NONE,RMAX,RMAX,RMAX); - MN(128,10) XPRIM(VERB, jtludecomp, jtludecomp, VASGSAFE,VF2NONE,RMAX, RMAX,RMAX); - MN(18,6) XPRIM(VERB, jtresetbloom, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(128,10) XPRIM(VERB, jtludecomp, jtludecomp, VASGSAFE,VF2NONE,RMAX, RMAX,RMAX); + MN(18,6) XPRIM(VERB, jtresetbloom, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); #if 0 // obsolete but not dead - MN(18,7) XPRIM(VERB, jtsetpermanent, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(18,7) XPRIM(VERB, jtsetpermanent, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); #endif - MN(0,-1) XPRIM(VERB, jtskipinscript, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(0,-1) 
XPRIM(VERB, jtskipinscript, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(5,-1) XPRIM(VERB, 0, jtoutstr, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,-1) XPRIM(VERB, jtleakblockread, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(9,-2) XPRIM(VERB, jtleakblockreset, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,-1) XPRIM(VERB, jtleakblockread, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(9,-2) XPRIM(VERB, jtleakblockreset, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(9,-3) XPRIM(VERB, jtshowinplacing1, jtshowinplacing2, VASGSAFE|VJTFLGOK1|VJTFLGOK2,VF2NONE,RMAX,RMAX,RMAX); MN(13,-1) XPRIM(VERB, 0, jtfindrange, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,-2) XPRIM(VERB, 0, jtfindrange4, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,-3) XPRIM(VERB, 0, jtfindrange2, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(13,-4) XPRIM(VERB, jthdrinfo, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,-4) XPRIM(VERB, jthdrinfo, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(13,-5) XPRIM(VERB, 0, jtauditpyx, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(13,-6) XPRIM(VERB, jtstackfault, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,-6) XPRIM(VERB, jtstackfault, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); #if PYXES MN(13,-7) XPRIM(VERB, jtnulljob, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); #endif MN(13,-8) XPRIM(VERB, jtcheckfreepool, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(13,-9) XPRIM(VERB, jtsetgetrecurstate, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(13,-10) XPRIM(VERB, jtcallJDo, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,-9) XPRIM(VERB, jtsetgetrecurstate, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(13,-10) XPRIM(VERB, jtcallJDo, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(15,6) XPRIM(VERB, jtdllsymget, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); MN(18,-1) XPRIM(VERB, jtlocmap, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); MN(18,-2) XPRIM(VERB, jtsympool, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(18,-3) XPRIM(VERB, jtlocnlz1, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); - MN(18,-4) XPRIM(VERB, jtlochdr, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(18,-3) 
XPRIM(VERB, jtlocnlz1, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); + MN(18,-4) XPRIM(VERB, jtlochdr, 0, VFLAGNONE,VF2NONE,RMAX,RMAX,RMAX); // MN(128,110) XPRIM(VERB, jttest1, 0, VASGSAFE,VF2NONE,RMAX,RMAX,RMAX); // TUNE printf("avg # probes=%7.3f\n",(double)totprobes/(double)totslots); diff --git a/jsrc/xa.c b/jsrc/xa.c index d4bcd63d0..010ec6144 100644 --- a/jsrc/xa.c +++ b/jsrc/xa.c @@ -11,6 +11,10 @@ extern uint64_t g_cpuFeatures; extern uint64_t g_cpuFeatures2; extern int numberOfCores; +#ifdef BOXEDSPARSE +extern UC fboxedsparse; +#endif + #include #ifdef _WIN32 #define strncasecmp _strnicmp @@ -313,7 +317,7 @@ F1(jtstackfault){C stackbyte,buf[80],*stackptr=&stackbyte; R 0; } -// 9!:56 undocumented +// 9!:56 // query/override cpu feature F1(jtcpufeature){ ARGCHK1(w); @@ -688,6 +692,25 @@ F2(jtgemmtune2){I j,k; R sc(1); } +// 9!:65 undocumented +// set boxed sparse array capacity 0 disable 1 enable + +// 9!:65 0/1 +F1(jtboxedsparse){I k; +#ifndef BOXEDSPARSE + ASSERT(0,ENONCE); +#else + ARGCHK1(w); + ASSERT(AT(w)&(B01+INT),EVDOMAIN); + ASSERT(1==AN(w),EVLENGTH); + ASSERT(1>=AR(w),EVRANK); + RE(k=i0(w)); // get arg + ASSERT(k==0||k==1,EVDOMAIN); + fboxedsparse=k; + R mtv; +#endif +} + // enable/disable tstack auditing, since some testcases run too long with it enabled // bit 0 is set to disable, bit 1 is a one-shot to ask for an audit // result is old value From 8fad37feb08325d97ee9fc61fb75969198b5382e Mon Sep 17 00:00:00 2001 From: bill lam Date: Sun, 17 Nov 2024 10:43:27 +0800 Subject: [PATCH 2/4] typo --- jsrc/xa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jsrc/xa.c b/jsrc/xa.c index 010ec6144..46bcb62fd 100644 --- a/jsrc/xa.c +++ b/jsrc/xa.c @@ -698,7 +698,7 @@ F2(jtgemmtune2){I j,k; // 9!:65 0/1 F1(jtboxedsparse){I k; #ifndef BOXEDSPARSE - ASSERT(0,ENONCE); + ASSERT(0,EVNONCE); #else ARGCHK1(w); ASSERT(AT(w)&(B01+INT),EVDOMAIN); From 6973bc18140514b5257f1afe5774d47d045914cd Mon Sep 17 00:00:00 2001 From: bill lam Date: Mon, 18 Nov 2024 
11:24:40 +0800 Subject: [PATCH 3/4] upstream sse2neon.h; clang 19 workaround --- jsrc/avxintrin-emu.h | 5 ++ jsrc/j.h | 2 + jsrc/sse2neon.h | 125 ++++++++++++++++++------------------------- 3 files changed, 59 insertions(+), 73 deletions(-) diff --git a/jsrc/avxintrin-emu.h b/jsrc/avxintrin-emu.h index 73d6e274a..a1017f13f 100644 --- a/jsrc/avxintrin-emu.h +++ b/jsrc/avxintrin-emu.h @@ -1818,6 +1818,11 @@ static __emu_inline __emu__m256i __emu_mm256_sllv_epi64(__emu__m256i a, __emu__m #define _mm256_xor_pd __emu_mm256_xor_pd #define _mm256_xor_ps __emu_mm256_xor_ps +/* clang 19 defines these macros */ +#undef _mm_cmp_pd +#undef _mm_cmp_ps +#undef _mm_cmp_sd +#undef _mm_cmp_ss #define _mm_cmp_pd __emu_mm_cmp_pd #define _mm256_cmp_pd __emu_mm256_cmp_pd diff --git a/jsrc/j.h b/jsrc/j.h index 41f1ad5ff..7684e598c 100644 --- a/jsrc/j.h +++ b/jsrc/j.h @@ -145,6 +145,8 @@ #if defined(__aarch64__)||defined(_M_ARM64) #if EMU_AVX2 +#undef SSE2NEON_SUPPRESS_WARNINGS +#define SSE2NEON_SUPPRESS_WARNINGS #include #include #include "sse2neon.h" diff --git a/jsrc/sse2neon.h b/jsrc/sse2neon.h index 44896dcfc..80d2fea5c 100644 --- a/jsrc/sse2neon.h +++ b/jsrc/sse2neon.h @@ -106,21 +106,15 @@ #pragma message("Macro name collisions may happen with unsupported compilers.") #endif - -#if defined(__GNUC__) && !defined(__clang__) -#pragma push_macro("FORCE_INLINE_OPTNONE") -#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0"))) -#elif defined(__clang__) -#pragma push_macro("FORCE_INLINE_OPTNONE") -#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone)) -#else -#define FORCE_INLINE_OPTNONE FORCE_INLINE -#endif - #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 #warning "GCC versions earlier than 10 are not supported." #endif +#if defined(__OPTIMIZE__) && !defined(SSE2NEON_SUPPRESS_WARNINGS) +#warning \ + "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
+#endif + /* C language does not allow initializing a variable with a function call. */ #ifdef __cplusplus #define _sse2neon_const static const @@ -128,6 +122,7 @@ #define _sse2neon_const const #endif +#include <fenv.h> #include <stdint.h> #include <stdlib.h> #include <string.h> @@ -160,7 +155,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) #include #endif -#if !defined(__cplusplus) && !defined(_MSC_VER) +#if !defined(__cplusplus) #error SSE2NEON only supports C++ compilation with this compiler #endif @@ -339,6 +334,15 @@ FORCE_INLINE void _sse2neon_smp_mb(void) #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) +/** + * MACRO for shuffle parameter for _mm_shuffle_pd(). + * Argument fp1 is a digit[01] that represents the fp from argument "b" + * of mm_shuffle_pd that will be placed in fp1 of result. + * fp0 is a digit[01] that represents the fp from argument "a" of mm_shuffle_pd + * that will be placed in fp0 of result. + */ +#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) + #if __has_builtin(__builtin_shufflevector) #define _sse2neon_shuffle(type, a, b, ...) 
\ __builtin_shufflevector(a, b, __VA_ARGS__) @@ -604,8 +608,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); FORCE_INLINE __m128d _mm_floor_pd(__m128d); FORCE_INLINE __m128 _mm_floor_ps(__m128); -FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int); -FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); // SSE4.2 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); @@ -1846,25 +1850,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + switch (fegetround()) { + case FE_TONEAREST: + return _MM_ROUND_NEAREST; + case FE_DOWNWARD: + return _MM_ROUND_DOWN; + case FE_UPWARD: + return _MM_ROUND_UP; + case FE_TOWARDZERO: + return _MM_ROUND_TOWARD_ZERO; + default: + // fegetround() must return _MM_ROUND_NEAREST, _MM_ROUND_DOWN, + // _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO on success. all the other error + // cases we treat them as FE_TOWARDZERO (truncate). 
+ return _MM_ROUND_TOWARD_ZERO; } } @@ -2458,46 +2457,28 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; + case _MM_ROUND_NEAREST: + rounding = FE_TONEAREST; break; case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; + rounding = FE_DOWNWARD; break; case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; + rounding = FE_UPWARD; break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; + case _MM_ROUND_TOWARD_ZERO: + rounding = FE_TOWARDZERO; + break; + default: + // rounding must be _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, + // _MM_ROUND_TOWARD_ZERO. all the other invalid values we treat them as + // FE_TOWARDZERO (truncate). + rounding = FE_TOWARDZERO; } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2neon_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif + fesetround(rounding); } // Copy single-precision (32-bit) floating-point element a to the lower element @@ -3899,7 +3880,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 -FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { // vrnd32xq_f64 not supported on clang #if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) @@ -3921,7 +3902,7 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 -FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0, d1; @@ -4217,7 +4198,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 -FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { double a0, a1; a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); @@ -7559,7 +7540,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) // the rounding parameter, and store the results as packed double-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd -FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { #if defined(__aarch64__) || defined(_M_ARM64) switch (rounding) { @@ -7628,7 +7609,7 @@ FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding) // the rounding parameter, and store the results as packed single-precision // floating-point elements in dst. 
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps -FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding) +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) @@ -9346,8 +9327,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) #endif } -FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode( - unsigned int flag) +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. @@ -9419,7 +9399,6 @@ FORCE_INLINE uint64_t _rdtsc(void) #if defined(__GNUC__) || defined(__clang__) #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") -#pragma pop_macro("FORCE_INLINE_OPTNONE") #endif #if defined(__GNUC__) && !defined(__clang__) From 98f1867ff4765db945bad746d32791257f465b73 Mon Sep 17 00:00:00 2001 From: bill lam Date: Tue, 19 Nov 2024 09:12:36 +0800 Subject: [PATCH 4/4] sse2neon.h patch for windows arm64 --- jsrc/sse2neon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jsrc/sse2neon.h b/jsrc/sse2neon.h index 80d2fea5c..1eb2df6fd 100644 --- a/jsrc/sse2neon.h +++ b/jsrc/sse2neon.h @@ -155,7 +155,8 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) #include #endif -#if !defined(__cplusplus) +/* && !defined(_MSC_VER) part for windows arm64 */ +#if !defined(__cplusplus) && !defined(_MSC_VER) #error SSE2NEON only supports C++ compilation with this compiler #endif