Skip to content

Commit

Permalink
Inplacing for i./i:/~.-./([-.-.)
Browse files Browse the repository at this point in the history
  • Loading branch information
HenryHRich committed Jan 21, 2025
1 parent d84142d commit 41abf29
Show file tree
Hide file tree
Showing 11 changed files with 236 additions and 161 deletions.
8 changes: 4 additions & 4 deletions jsrc/ar.c
Original file line number Diff line number Diff line change
Expand Up @@ -1013,10 +1013,10 @@ F1(jtslash){F1PREFIP;A h;AF f1;C c;V*v;
v=FAV(w);
I flag=v->flag&VASGSAFE; // if u is asgsafe, so is u/
switch(v->id){ // select the monadic case
case CCOMMA: f1=jtredcat; flag|=VJTFLGOK1; break;
case CCOMDOT: f1=jtredstitch; break;
case CSEMICO: f1=jtredsemi; break;
case CUNDER: f1=jtreduce; if(COPE==IDD(v->fgh[1])){c=FAV(v->fgh[0])->id; if(c==CCOMMA)f1=jtredcateach; else if(c==CCOMDOT)f1=jtredstiteach;} break;
case CCOMMA: f1=jtredcat; flag|=VJTFLGOK1; break; // ,/
case CCOMDOT: f1=jtredstitch; break; // ,./
case CSEMICO: f1=jtredsemi; break; // ;/
case CUNDER: f1=jtreduce; if(COPE==IDD(v->fgh[1])){c=FAV(v->fgh[0])->id; if(c==CCOMMA)f1=jtredcateach; else if(c==CCOMDOT)f1=jtredstiteach;} break; // ,&.>/
default: f1=jtreduce; flag|=(v->flag&VJTFLGOK2)>>(VJTFLGOK2X-VJTFLGOK1X); break; // monad is inplaceable if the dyad for u is
}
RZ(h=qq(w,v2(lr(w),RMAX))); // create the rank compound to use if dyad
Expand Down
13 changes: 7 additions & 6 deletions jsrc/j.h
Original file line number Diff line number Diff line change
Expand Up @@ -638,13 +638,13 @@ struct jtimespec jmtfclk(void); //'fast clock'; maybe less inaccurate; intended
// modes for indexofsub()
#define IIOPMSKX 5 // # bits of flags
#define IIOPMSK (((I)1<<IIOPMSKX)-1) // operation bits. INTER also uses bit 3, which is included as a modifier in the switches
#define IIOPMSKINIT 0xf //
// obsolete #define IIOPMSKINIT 0xf // used to mask the op for initializing botmasks. Then the low-order 4 bits indicate the value to use
#define IIDOT 0 // IIDOT and IICO must be 0-1
#define IICO 1
#define INUBSV 2 // BIT arrays INUBSV-INUBI init to 1 so that out-of-bounds in LESS keeps the value
#define INUB 3
#define ILESS 4
#define INUBI 5
#define ILESS 3 // -.
#define INUB 4 // ~. NUB must be paired with NUBI
#define INUBI 5 // I.@:~:
#define IEPS 6 // BIT arrays IEPS and above init to 0 so out-of-bounds means not included
// the I...EPS values below are wired into the function table at the end of vcompsc.c, where they are combined with a comparison
#define II0EPS 7 // i.&0@:e. this must come first; others base on it
Expand All @@ -656,9 +656,10 @@ struct jtimespec jmtfclk(void); //'fast clock'; maybe less inaccurate; intended
#define IALLEPS 13 // *./@:e.
#define IIFBEPS 14 // I.@e.
#define IFORKEY 15 // special key support: like i.~, but add # values mapped to the index, and return #unique values in AM
#define IINTER 16 // ([ -. -.)
#define IINTER 16 // ([ -. -.) LSBs cause BIT arrays to init to 0 meaning 'don't keep'
#define INUBIP 0x14 // ~. inplace LSBs cause BIT arrays to init to 1
#define IIMODFIELD ((I)7<<IIOPMSKX) // bits used to indicate processing options
#define IIMODPACKX 5
#define IIMODPACKX 5 // must not exceed 5 since used in a shift
#define IIMODPACK (((I)1)<<IIMODPACKX) // modifier for type. (small-range search except i./i:) In IIDOT/IICO, indicates reflexive application. In others, indicates that the
// bitmask should be stored as packed bits rather than bytes
#define IIMODREFLEXX 5 // overlaps IIMODPACK; OK because reflexive i./i: needs to know where the match was & can't use bitmask
Expand Down
2 changes: 1 addition & 1 deletion jsrc/t.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ PRIM primtab[256] = {
/* *. */ PRIMATOMIC2(CSTARDOT,CSTARDOT,VERB, jtpolar, jtatomic2, 0, 0, 0 ,VISATOMIC1|VFUSEDOK2|VIRS2|VASGSAFE|VJTFLGOK2,VF2NONE|VF2PRIM),
/* *: */ PRIMATOMIC2(CSTARCO,CSTARCO, VERB, jtsquare, jtatomic2, 0, 0, 0 ,VISATOMIC1|VFUSEDOK2|VIRS2|VASGSAFE|VJTFLGOK1|VJTFLGOK2,VF2NONE|VF2PRIM),
/* - */ PRIMATOMIC2UV(CMINUS,CMINUS,VA1CNEG-VA1ORIGIN, VERB, jtnegate, jtatomic2, 0, 0, 0 ,VISATOMIC1|VFUSEDOK2|VIRS2|VASGSAFE|VJTFLGOK1|VJTFLGOK2,VF2NONE|VF2PRIM),
/* -. */ PRIMACV(CNOT, VERB, jtnot, jtless, 0, RMAX,RMAX,VISATOMIC1|VASGSAFE|VJTFLGOK1|((7+(((ILESS-II0EPS)&0xf)<<3))&-SY_64),VF2NONE|VF2PRIM), // native compound allowing &n - 64-bit only
/* -. */ PRIMACV(CNOT, VERB, jtnot, jtless, 0, RMAX,RMAX,VISATOMIC1|VASGSAFE|VJTFLGOK1|VJTFLGOK2|((7+(((ILESS-II0EPS)&0xf)<<3))&-SY_64),VF2NONE|VF2PRIM), // native compound allowing &n - 64-bit only
/* -: */ PRIMACV(CHALVE, VERB, jthalve, jtmatch, 0, RMAX,RMAX,VISATOMIC1|VIRS2|VASGSAFE|VJTFLGOK1,VF2NONE|VF2PRIM), // alias CMATCH
/* % */ PRIMATOMIC2UV(CDIV,CDIV,VA1CRECIP-VA1ORIGIN, VERB, jtrecip, jtatomic2, 0, 0, 0 ,VISATOMIC1|VFUSEDOK2|VIRS2|VASGSAFE|VJTFLGOK1|VJTFLGOK2,VF2NONE|VF2PRIM),
/* %. */ PRIMACV(CDOMINO, VERB, jtminv, jtmdiv, 2, RMAX,2 ,VASGSAFE,VF2NONE|VF2PRIM),
Expand Down
49 changes: 31 additions & 18 deletions jsrc/viavx.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ I hashallo(IH * RESTRICT hh,UI p,UI asct,I md){
// ~. ~: I.@~. -. all prefer the table to be complemented and thus initialized to 1.
// REVERSED types always initialize to 1, whether packed or not
// this is a kludge - the initialization value should be passed in by the caller, in asct
UI fillval = md&IREVERSED?(md&IIMODPACK?255:1):((md&(IIMODPACK+IIOPMSK))<=INUBI); fillval|=fillval<<8; fillval|=fillval<<16; // mvc overfetches, so need full UI
mvc(p,hh->data.UI,4,&fillval); // fill with repeated copies of fillval
UI fillval = (((1LL<<INUBSV)|(1LL<<ILESS)|(1LL<<INUB)|(1LL<<INUBI)|(1LL<<INUBIP))>>(md&(IIMODPACK+IIOPMSK)))&1; UI temp=md&IIMODPACK?255:1; fillval=md&IREVERSED?temp:fillval; // mvc overfetches, so need full UI. If PACK, always 0; otherwise look at bits 0-3 of opcode
// obsolete UI fillval = md&IREVERSED?(md&IIMODPACK?255:1):((md&(IIMODPACK+IIOPMSKINIT))<=INUBI); // mvc overfetches, so need full UI. If PACK, always 0; otherwise look at bits 0-3 of opcode
// obsolete fillval|=fillval<<8; fillval|=fillval<<16;
mvc(p,hh->data.UI,1,&fillval); // fill with repeated copies of fillval
// If the invalid area grows, update the invalid hwmk, and also the partition
p >>= hh->hashelelgsize; // convert p to hash index
if(p>hh->invalidhi){
Expand Down Expand Up @@ -767,7 +769,7 @@ A jtindexofsub(J jt,I mode,AD * RESTRICT a,AD * RESTRICT w){F2PREFIP;PROLOG(0079
I zt; A z; // type of result to allocate; address of block
if((mode&IIOPMSK)==IEPS)zt=B01;
else{
if(likely(a!=w)&&(at&INT+SY_64*FL)&-(SGNTO0(AC(w))&(I)jtinplace)){z=w; goto inplace;} // if inplaceable (not including assignment) and items have the right size
if(likely(a!=w)&&(at&INT+SY_64*FL)&-(SGNTO0(AC(w))&((I)jtinplace>>JTINPLACEWX))){z=w; goto inplace;} // if inplaceable (not including assignment) and items have the right size
zt=INT; // the result type depends on the operation.
}
GA(z,zt,wn,wr,AS(w)); // allocate result area
Expand Down Expand Up @@ -1064,22 +1066,33 @@ inplace:;
AF ifn=fntbl[FNTABLEPREFIX+fnx][bighash]; // get an early start fetching the function we will call

// Allocate the result area. NOTE that some of the routines, like small-range, always store at least one result; so we have to point z somewhere harmless before launching them. If we are prehashing we skip this.
// If the conditions are right, perform the operation inplace
A z;
// If the conditions are right, perform the operation inplace.
A z; // will hold result
switch(mode&(IPHCALC|IIOPMSK)){
default: fauxINT(z,zfaux,1,0) break; // if prehashed, we must create an area that can hold at least one stored result
case IIDOT: case IFORKEY:
case IICO: GATV0(z,INT,zn,f+f1); MCISH(AS(z),s,f) MCISH(f+AS(z),ws+wf,f1); break; // mustn't overfetch s
case IIDOT:
case IICO:
// i./i: can run inplace on w if w is abandoned, not the same block as a, rank matches rank needed, item size<=SZI, DIRECT, and (not UNINCORPABLE or same type as result)
if(likely(a!=w)&&likely(!(AFLAG(w)&AFUNINCORPABLE+AFRO))&&(-(AT(w)&(INT+SY_64*FL))&AC(w)&SGNIF(jtinplace,JTINPLACEWX)&((AR(w)^(f+f1))-1))<0){z=w; AT(z)=INT; break;} // inplace w if not disqualified
// if can't inplace fall through to...
case IFORKEY:
GATV0(z,INT,zn,f+f1); MCISH(AS(z),s,f) MCISH(f+AS(z),ws+wf,f1); break; // mustn't overfetch s
case ILESS: case IINTER:
// -./([-.-.) can run inplace if w is abandoned, not the same block as a, rank not 0, DIRECT, not UNINCORPABLE (since we don't want to change shape of an unincorpable)
if(likely(a!=w)&&likely(!(AFLAG(w)&AFUNINCORPABLE+AFRO))&&(((AT(w)&~DIRECT)-1)&AC(w)&SGNIF(jtinplace,JTINPLACEWX)&-(AR(w)))<0){z=w; break;} // inplace w if not disqualified
ws=wr==0?&AN(w):ws; GA(z,AT(w),AN(w),MAX(1,wr),ws); break; // if wr is an atom, use 1 for the shape
case INUB:
// ~. can run inplace if abandoned, rank>0, DIRECT
if(likely(!(AFLAG(w)&AFUNINCORPABLE+AFRO))&&(((AT(w)&~DIRECT)-1)&AC(w)&SGNIF(jtinplace,JTINPLACEWX)&-(AR(w)))<0){z=w; mode^=INUB^INUBIP; break;} // inplace w if not disqualified
{I q; PRODX(q,AR(a)-1,AS(a)+1,MIN(m,p+1)) GA(z,t,q,MAX(1,wr),ws); break;} // we speculatively overwrite, possibly 1 more than in a but no more than in w
case INUBI: GATV0(z,INT,MIN(m,p)+1,1); break; // we speculatively overwrite but never past a full buffer
case INUBSV: GATV0(z,B01,zn,f+f1+!acr); MCISH(AS(z),s,f) MCISH(f+AS(z),ws+wf,f1); if(!acr)AS(z)[AR(z)-1]=1; break; // mustn't overfetch s
case INUB: {I q; PRODX(q,AR(a)-1,AS(a)+1,MIN(m,p)+1) GA(z,t,q,MAX(1,wr),ws); break;} // +1 because we speculatively overwrite.
case ILESS: case IINTER: ws=wr==0?&AN(w):ws; GA(z,AT(w),AN(w),MAX(1,wr),ws); break; // if wr is an atom, use 1 for the shape
case IEPS: GATV0(z,B01,zn,f+f1); MCISH(AS(z),s,f) MCISH(f+AS(z),ws+wf,f1); break;
case INUBI: GATV0(z,INT,MIN(m,p)+1,1); break; // +1 because we speculatively overwrite
// (e. i. 0:) and friends don't do anything useful if e. produces rank > 1. The search for 0/1 always fails
case II0EPS: case II1EPS: case IJ0EPS: case IJ1EPS:
if(wr>MAX(ar,1))R sc(wr>r?ws[0]:1); GAT0(z,INT,1,0); break;
// ([: I. e.) ([: +/ e.) ([: +./ e.) ([: *./ e.) come here only if e. produces rank 0 or 1.
case IIFBEPS: GATV0(z,INT,c+1,1); break; // +1 because we speculatively overwrite
case IIFBEPS: GATV0(z,INT,c,1); break; // we speculatively overwrite but not past a full buffer
case IANYEPS: case IALLEPS:
GAT0(z,B01,1,0); break;
case ISUMEPS:
Expand All @@ -1099,7 +1112,7 @@ inplace:;
case IFORKEY: {z=reshape(shape(z),take(sc(m),sc(m))); RZ(z=mkwris(z)); AM(z)=!!m; R z;} // all 0 but the first has the total count. Must install # partitions=1 if #items>0
case IICO: R reshape(shape(z),sc(n?m:m-1));
case INUBSV: R reshape(shape(z),take(sc(m),num(1)));
case INUB: AN(z)=0; AS(z)[0]=m?1:0; R z;
case INUB: case INUBIP: AN(z)=0; AS(z)[0]=m?1:0; R z;
case ILESS: if(m&&fnx==-3)AN(z)=AS(z)[0]=0; else MC(AV(z),AV(w),AN(w)<<bplg(AT(w))); R z;
case IINTER: if(!(m&&fnx==-3))AN(z)=AS(z)[0]=0; R z; // y has atoms or something is empty, return all of w; otherwise empty
case IEPS: R reshape(shape(z),num(m&&(!n||(fnx&1)))); // fnx&1 is true if homo
Expand Down Expand Up @@ -1197,7 +1210,7 @@ A jtindexofprehashed(J jt,A a,A w,A hs,A self){A h,*hv,x,z;AF fn;I ar,*as,at,c,f
R z;
}

// x i. y, supports inplacing
// x i. y, supports inplacing (in subroutine)
F2(jtindexof){
if(unlikely(((UI)a^(UI)ds(CALP))<(UI)(AT(w)&LIT))&&likely(!ISSPARSE(AT(w)))){F2PREFIP; R jtadotidot(jt,w);}
R indexofsub(IIDOT,a,w);
Expand All @@ -1220,13 +1233,13 @@ F1(jtnubsieve){
F1(jtnub){
F1PREFIP;ARGCHK1(w);
if(unlikely((SGNIFSPARSE(AT(w))|SGNIF(AFLAG(w),AFNJAX))<0))R repeat(nubsieve(w),w); // sparse or NJA
A z; RZ(z=indexofsub(INUB,w,w));
A z; RZ(z=jtindexofsub(jtinplace,INUB,w,w));
// We extracted from w, so mark it (or its backer if virtual) non-pristine. If w was pristine and inplaceable, transfer its pristine status to the result. We overwrite w because it is no longer in use
PRISTXFERF(z,w)
RETF(z);
} /* ~.w */

// x -. y. does not have IRS
// x -. y. does not have IRS, supports inplacing
F2(jtless){A x=w;I ar,at,k,r,*s,wr,*ws;
F2PREFIP;ARGCHK2(a,w);
at=AT(a); ar=AR(a);
Expand All @@ -1237,14 +1250,14 @@ F2(jtless){A x=w;I ar,at,k,r,*s,wr,*ws;
if(unlikely((-wr&-(r^wr))<0)){RZ(x=virtual(w,0,r)); AN(x)=wn; s=AS(x); ws=AS(w); k=ar>wr?0:1+wr-r; I s0; PRODX(s0,k,ws,1) s[0]=s0; MCISH(1+s,k+ws,r-1);} // use fauxvirtual here
// if nothing special (like sparse, or incompatible types, or x requires conversion) do the fast way; otherwise (-. x e. y) # x
// because LESS allocates a large array to hold all the values, we use the slower, less memory-intensive, version if a is mapped
RZ(x=(SGNIFSPARSE(at)|SGNIF(AFLAG(a),AFNJAX))>=0?indexofsub(ILESS,x,a):
RZ(x=(SGNIFSPARSE(at)|SGNIF(AFLAG(a),AFNJAX))>=0?jtindexofsub(jtinplace,ILESS,x,a):
repeat(not(eps(a,x)),a));
// We extracted from a, so mark it (or its backer if virtual) non-pristine. If a was pristine and inplaceable, transfer its pristine status to the result
PRISTXFERAF(x,a)
RETF(x);
} /* a-.w */

// x ([ -. -.[!.f]) y. does not have IRS
// x ([ -. -.[!.f]) y. does not have IRS, supports inplacing
DF2(jtintersect){A x=w;I ar,at,k,r,*s,wr,*ws;
F2PREFIP;ARGCHK2(a,w);
at=AT(a); ar=AR(a);
Expand All @@ -1259,7 +1272,7 @@ DF2(jtintersect){A x=w;I ar,at,k,r,*s,wr,*ws;
// if nothing special (like sparse, or incompatible types, or x requires conversion) do the fast way; otherwise (-. x e. y) # x
// because LESS allocates a large array to hold all the values, we use the slower, less memory-intensive, version if a is mapped
// Don't revert to fork! localuse.lu1.fork2hfn is not set
x=(SGNIFSPARSE(at)|SGNIF(AFLAG(a),AFNJAX))>=0?indexofsub(IINTER,x,a):
x=(SGNIFSPARSE(at)|SGNIF(AFLAG(a),AFNJAX))>=0?jtindexofsub(jtinplace,IINTER,x,a):
repeat(eps(a,x),a);
POPCCT
RZ(x);
Expand Down
15 changes: 10 additions & 5 deletions jsrc/viavx.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,9 @@
#define HASHSLOT(hash) j=((hash)*p)>>32;

// Misc code to set the shape once we see how many results there are, used for ~. y and x -. y
#define ZISHAPE AS(z)[0]=AN(z)=zi-zv
#define ZISHAPE AS(z)[0]=AN(z)=zi-zv // zi must point to a single atom (sc. an index)
#define ZCSHAPE AS(z)[0]=(zc-(C*)zv)/k; AN(z)=n*AS(z)[0]
#define ZCSHAPEI AN(z)=n*(AS(z)[0]=(zc-zv)) // I *zc points to a SZI-sized cell but might not be an atom
#define ZUSHAPE(T) AS(z)[0]= zu-(T*)zv; AN(z)=n*AS(z)[0]

// Calculate the hash slot. The hash calculation (input parm) relies on the name v and produces the name j. We have moved v to an xmm register to reduce register pressure
Expand All @@ -78,11 +79,12 @@
// If (store) is 1, the value of i (which is the loop index giving the position within a of the item being processed) is stored into the empty hash slot,
// only if the hash search does not find a match. If (store) is 2, the entry that we found is cleared, by setting it to maxcount+1, when we find a match.
// When (store)=2, we also ignore hash entries containing maxcount+1, treating them as failed compares
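// (store)=3 is ~. inplace: when the hash search does not find a match, wsct (used in this mode as the stored-item count) is stored into the empty hash slot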
// Independent of (store), (fstmt) is executed if the item is found in the hash table, and (nfstmt) is executed if it is not found.
#define FINDP(T,TH,hsrc,name,exp,fstmt,nfstmt,store) NOUNROLL do{if(hj==hsrc##sct){ \
if(store==1)hv[name]=(TH)i; nfstmt break;} /* this is the not-found case */ \
#define FINDP(T,TH,hsrc,name,exp,fstmt,nfstmt,store) NOUNROLL do{ \
if(hj==hsrc##sct){if(store==1)hv[name]=(TH)i; if(store==3)hv[name]=wsct; nfstmt break;} /* this is the not-found case */ \
if((store!=2||hj<hsrc##sct)&&(v=(T*)_mm_extract_epi64(vp,1),!(exp))){if(store==2)hv[name]=(TH)(hsrc##sct+1); fstmt break;} /* found */ \
if(unlikely(--name<0))name+=p; hj=hv[name]; /* miscompare, nust continue search */ \
if(unlikely(--name<0))name+=p; hj=hv[name]; /* miscompare, must continue search */ \
}while(1);

// Traverse the hash table for one argument. (src) indicates which argument, a or w, we are looping through; (hsrc) indicates which argument provided the hash table.
Expand All @@ -92,6 +94,7 @@
// q+2 is being calculated).
// The (fstmt,nfstmt,store) arguments indicate what to do when a match/notmatch is resolved.
// (loopctl) give the stride through the input array, the control for the main loop, and the index of the last value. These values differ for forward and reverse scans through the input.
// in the loop i is the index of the item being looked up in the hash (if store is not 0 or 3, that will be the index stored into the hashtable on notfound)
#define XSEARCH(T,TH,src,hsrc,hash,exp,stride,fstmt,nfstmt,store,vpofst,loopctl,finali) \
{I i, j, hj; T *v; vp=_mm_insert_epi64(vp,(I)(src##v+vpofst),0); vpstride = _mm_insert_epi64(vp,(stride)*(I)sizeof(T),0); vp=_mm_shuffle_epi32(vp,0x44); vpstride=_mm_insert_epi64(vpstride,0LL,1); \
HASHSLOTP(T,hash) if(src##sct>1){I j1,j2; vp=_mm_add_epi64(vp,vpstride); j1=j; HASHSLOTP(T,hash) hj=hv[j1]; vp=_mm_add_epi64(vp,vpstride); vpstride=_mm_shuffle_epi32(vpstride,0x44); \
Expand All @@ -100,8 +103,10 @@

// Traverse a in forward direction, adding values to the hash table
#define XDOAP(T,TH,hash,exp,stride) XSEARCH(T,TH,a,a,hash,exp,stride,{},{},1,0, (i=0;i<asct-2;++i) ,asct-1)
// Traverse w in forward direction, executing fstmt/nfstmt depending on found/notfound; and adding to the hash if (reflex) is 1, indicating a reflexive operation
// Traverse w in forward direction, executing fstmt/nfstmt depending on found/notfound; and adding to the hash if (reflex) is 1, indicating a reflexive operation scaf could save a register if reflexive
#define XDOP(T,TH,hash,exp,stride,fstmt,nfstmt,reflex) XSEARCH(T,TH,w,a,hash,exp,stride,fstmt,nfstmt,reflex,0, (i=0;i<wsct-2;++i) ,wsct-1)
// version used for ~. inplace: reflex=3 (controls FINDP), only a is used. wsct is freed for use as stored-item count
#define XDOPIP(T,TH,hash,exp,stride,fstmt,nfstmt,reflex) XSEARCH(T,TH,a,a,hash,exp,stride,fstmt,nfstmt,reflex,0, (i=0;i<asct-2;++i) ,asct-1)
// same for traversing a/w in reverse
#define XDQAP(T,TH,hash,exp,stride) XSEARCH(T,TH,a,a,hash,exp,(-(stride)),{},{},1,cn*(asct-1), (i=asct-1;i>1;--i) ,0)
#define XDQP(T,TH,hash,exp,stride,fstmt,nfstmt,reflex) XSEARCH(T,TH,w,a,hash,exp,(-(stride)),fstmt,nfstmt,reflex,cn*(wsct-1), (i=wsct-1;i>1;--i) ,0)
Expand Down
Loading

0 comments on commit 41abf29

Please sign in to comment.